In [None]:
#| default_exp data.utils

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| export

import re
from datetime import datetime
from typing import *
from urllib.parse import unquote_plus as urlunquote

import dask.dataframe as dd
from mypy_boto3_s3.service_resource import Bucket

import airt_service.sanitizer
from airt.logger import get_logger
from airt_service.db.models import (
    create_connection_string,
    DataSource,
    DataBlob,
    Model,
    Prediction,
)
from airt_service.aws.utils import (
    create_s3_datablob_path,
    create_s3_datasource_path,
    get_s3_bucket_and_path_from_uri,
    get_s3_bucket_name_and_folder_from_uri,
)
from airt_service.azure.utils import (
    create_azure_blob_storage_datablob_path,
    create_azure_blob_storage_datasource_path,
    get_azure_blob_storage_container,
)
from airt_service.constants import METADATA_FOLDER_PATH

In [None]:
import json
import os
import shutil
from time import sleep

from sqlmodel import select
import pandas as pd

from airt.remote_path import RemotePath
from airt_service.aws.utils import upload_to_s3_with_retry
from airt_service.data.csv import process_csv
from airt_service.data.datablob import FromLocalRequest, from_local_start_route
from airt_service.db.models import (
    get_session,
    get_session_with_context,
    User,
    create_user_for_testing,
)

[INFO] airt.data.importers: Module loaded:
[INFO] airt.data.importers:  - using pandas     : 1.5.1
[INFO] airt.data.importers:  - using dask       : 2022.10.0
[INFO] airt.executor.subcommand: Module loaded.


In [None]:
# Create a user for testing; later test cells look this user up by the returned username
test_username = create_user_for_testing()
display(test_username)

'hdgdzpegvs'

In [None]:
#| exporti

# Module-level logger named after this module
logger = get_logger(__name__)

In [None]:
#| export


def create_db_uri_for_s3_datablob(uri: str, access_key: str, secret_key: str) -> str:
    """Build the db_uri of an s3 datablob by embedding its credentials in the URI

    The result has the form ``s3://<access_key>:<secret_key>@<rest of uri>``.
    The keys are inserted verbatim; no URL-encoding is applied.

    Args:
        uri: URI of the s3 datablob, e.g. ``s3://bucket/folder``
        access_key: Access key of the s3 datablob
        secret_key: Secret key of the s3 datablob

    Returns:
        The db_uri of the s3 datablob with the credentials embedded
    """
    scheme = "s3://"
    # Strip every occurrence of the scheme; it is re-added once in front of the credentials
    location = "".join(uri.split(scheme))
    return "{}{}:{}@{}".format(scheme, access_key, secret_key, location)

In [None]:
# Fixtures for the s3 db_uri helpers; the cell that tests get_s3_connection_params_from_db_uri
# reuses this list. Each case pairs a plain s3 uri and its keys with the db_uri expected
# after the keys are embedded.
# NOTE(review): the keys below look like placeholders — confirm none of them is a live credential.
s3_test_cases = [
    {
        "uri": "s3://bucket/hello/world",
        "access_key": "AIKEA987SDFADF",
        "secret_key": "9UVSadsalfajJJHGUIYGjhsdfaf+0",
        "db_uri": "s3://AIKEA987SDFADF:9UVSadsalfajJJHGUIYGjhsdfaf+0@bucket/hello/world",
    },
    {
        "uri": "s3://test-airt-service/account_312571_events",
        "access_key": "AIKEA987SDFADF",
        "secret_key": "9UVSadsalfajJJHGUIYGjhsdfaf+0",
        "db_uri": "s3://AIKEA987SDFADF:9UVSadsalfajJJHGUIYGjhsdfaf+0@test-airt-service/account_312571_events",
    },
    {
        "uri": "s3://bucket/hello/world/again",
        "access_key": "AIKEA987SDFADF",
        "secret_key": "9UVSadsalfajJJHGUIYGjhsdfaf+0",
        "db_uri": "s3://AIKEA987SDFADF:9UVSadsalfajJJHGUIYGjhsdfaf+0@bucket/hello/world/again",
    },
    # uri carrying a query string followed by a fragment
    {
        "uri": "s3://bucket/hello/world?qwe1=3#ddd",
        "access_key": "AIKEA987SDFADF",
        "secret_key": "9UVSadsalfajJJHGUIYGjhsdfaf+0",
        "db_uri": "s3://AIKEA987SDFADF:9UVSadsalfajJJHGUIYGjhsdfaf+0@bucket/hello/world?qwe1=3#ddd",
    },
    # uri carrying a "#" part followed by a "?" part
    {
        "uri": "s3://bucket/hello/world#foo?bar=2",
        "access_key": "AIKEA987SDFADF",
        "secret_key": "9UVSadsalfajJJHGUIYGjhsdfaf+0",
        "db_uri": "s3://AIKEA987SDFADF:9UVSadsalfajJJHGUIYGjhsdfaf+0@bucket/hello/world#foo?bar=2",
    },
]

# Building the db_uri from each case's parts must reproduce the expected "db_uri" exactly
for test_case in s3_test_cases:
    actual_db_uri = create_db_uri_for_s3_datablob(
        uri=test_case["uri"],
        access_key=test_case["access_key"],
        secret_key=test_case["secret_key"],
    )
    display(f"{actual_db_uri=}")
    assert actual_db_uri == test_case["db_uri"]

"actual_db_uri='s3://****************************************@bucket/hello/world'"

"actual_db_uri='s3://****************************************@test-airt-service/account_312571_events'"

"actual_db_uri='s3://****************************************@bucket/hello/world/again'"

"actual_db_uri='s3://****************************************@bucket/hello/world?qwe1=3#ddd'"

"actual_db_uri='s3://****************************************@bucket/hello/world#foo?bar=2'"

In [None]:
#| export


def get_s3_connection_params_from_db_uri(db_uri: str) -> Tuple[str, str, str]:
    """Get S3 connection params from db_uri of the s3 datablob

    The db_uri is expected to look like
    ``s3://<access_key>:<secret_key>@<bucket>/<path>``.

    Args:
        db_uri: DB uri of s3 datablob

    Returns:
        The uri, access key and secret key of the s3 datablob as a tuple

    Raises:
        ValueError: If db_uri does not have the expected format
    """
    # The credentials end at the first "@". Excluding "@" from both key groups keeps
    # an "@" inside the object path from being swallowed into the secret key.
    result = re.search(r"s3://([^@]*):([^@]*)@(.*)", db_uri)
    if result is None:
        # The db_uri itself is left out of the message because it may carry credentials
        raise ValueError(
            "Invalid s3 db_uri; expected format: s3://<access_key>:<secret_key>@<bucket>/<path>"
        )
    access_key, secret_key, location = result.groups()
    return f"s3://{location}", access_key, secret_key

In [None]:
# Round trip: parsing each expected db_uri must give back the original uri and keys
for test_case in s3_test_cases:
    (
        actual_uri,
        actual_access_key,
        actual_secret_key,
    ) = get_s3_connection_params_from_db_uri(db_uri=test_case["db_uri"])
    display(f"{actual_uri=}", f"{actual_access_key=}", f"{actual_secret_key=}")

    assert actual_uri == test_case["uri"]
    assert actual_access_key == test_case["access_key"]
    assert actual_secret_key == test_case["secret_key"]

"actual_uri='s3://bucket/hello/world'"

"actual_access_key = '****************************************'"

"actual_secret_key = '****************************************'"

"actual_uri='s3://test-airt-service/account_312571_events'"

"actual_access_key = '****************************************'"

"actual_secret_key = '****************************************'"

"actual_uri='s3://bucket/hello/world/again'"

"actual_access_key = '****************************************'"

"actual_secret_key = '****************************************'"

"actual_uri='s3://bucket/hello/world?qwe1=3#ddd'"

"actual_access_key = '****************************************'"

"actual_secret_key = '****************************************'"

"actual_uri='s3://bucket/hello/world#foo?bar=2'"

"actual_access_key = '****************************************'"

"actual_secret_key = '****************************************'"

In [None]:
#| export


def create_db_uri_for_azure_blob_storage_datablob(uri: str, credential: str) -> str:
    """Build the db_uri of an azure blob storage datablob by embedding its credential

    The result has the form ``https://<credential>@<rest of uri>``.
    The credential is inserted verbatim; no URL-encoding is applied.

    Args:
        uri: URI of azure blob storage datablob
        credential: Credential of azure blob storage datablob

    Returns:
        The db_uri of the azure blob storage datablob with the credential embedded
    """
    scheme = "https://"
    # Strip every occurrence of the scheme; it is re-added once in front of the credential
    location = "".join(uri.split(scheme))
    return "{}{}@{}".format(scheme, credential, location)

In [None]:
# Fixtures for the azure db_uri helpers; the cell that tests
# get_azure_blob_storage_connection_params_from_db_uri reuses this list.
# The credential is an obviously fake value (it still contains "+", "/" and "=" like a
# base64 key) so that no usable secret is stored in the notebook or its rendered outputs.
azure_test_cases = [
    {
        "uri": "https://testairtservice.blob.core.windows.net/test-container/account_312571_events",
        "credential": "FAKEazureSTORAGEkey0123456789+not/a/real+credential==",
        "db_uri": "https://FAKEazureSTORAGEkey0123456789+not/a/real+credential==@testairtservice.blob.core.windows.net/test-container/account_312571_events",
    },
    {
        "uri": "https://testairtservice.blob.core.windows.net/test-container/account_312571_events/folder",
        "credential": "FAKEazureSTORAGEkey0123456789+not/a/real+credential==",
        "db_uri": "https://FAKEazureSTORAGEkey0123456789+not/a/real+credential==@testairtservice.blob.core.windows.net/test-container/account_312571_events/folder",
    },
    {
        "uri": "https://testairtservice.blob.core.windows.net/test-container",
        "credential": "FAKEazureSTORAGEkey0123456789+not/a/real+credential==",
        "db_uri": "https://FAKEazureSTORAGEkey0123456789+not/a/real+credential==@testairtservice.blob.core.windows.net/test-container",
    },
]

# Building the db_uri from each case's parts must reproduce the expected "db_uri" exactly
for test_case in azure_test_cases:
    actual_db_uri = create_db_uri_for_azure_blob_storage_datablob(
        uri=test_case["uri"],
        credential=test_case["credential"],
    )
    display(f"{actual_db_uri=}")
    assert actual_db_uri == test_case["db_uri"]

"actual_db_uri='https://****************************************@testairtservice.blob.core.windows.net/test-container/account_312571_events'"

"actual_db_uri='https://****************************************@testairtservice.blob.core.windows.net/test-container/account_312571_events/folder'"

"actual_db_uri='https://****************************************@testairtservice.blob.core.windows.net/test-container'"

In [None]:
#| export


def get_azure_blob_storage_connection_params_from_db_uri(
    db_uri: str,
) -> Tuple[str, str]:
    """Get azure blob storage connection params from db_uri of the azure blob storage datablob

    The db_uri is expected to look like ``https://<credential>@<account host>/<path>``.

    Args:
        db_uri: DB uri of azure blob storage datablob

    Returns:
        The uri and credential of the azure blob storage datablob as a tuple

    Raises:
        ValueError: If db_uri does not have the expected format
    """
    # The credential ends at the first "@". Excluding "@" from the credential group
    # keeps an "@" inside the blob path from being swallowed into the credential.
    result = re.search(r"https://([^@]*)@(.*)", db_uri)
    if result is None:
        # The db_uri itself is left out of the message because it may carry a credential
        raise ValueError(
            "Invalid azure blob storage db_uri; expected format: https://<credential>@<host>/<path>"
        )
    credential, location = result.groups()
    return f"https://{location}", credential

In [None]:
# Round trip: parsing each expected db_uri must give back the original uri and credential
for test_case in azure_test_cases:
    (
        actual_uri,
        actual_credential,
    ) = get_azure_blob_storage_connection_params_from_db_uri(db_uri=test_case["db_uri"])
    # NOTE(review): this renders the credential as part of the cell output
    display(f"{actual_uri=}", f"{actual_credential=}")

    assert actual_uri == test_case["uri"]
    assert actual_credential == test_case["credential"]

"actual_uri='https://testairtservice.blob.core.windows.net/test-container/account_312571_events'"

"actual_credential='xFLcltokRem1ADQaM4PP81XXkmvb21rZQhUqbo3C4RjIG4yeMneOJLLc9AWQOa9LeNLH6EuMLe4H+ALp7kFM+Q=='"

"actual_uri='https://testairtservice.blob.core.windows.net/test-container/account_312571_events/folder'"

"actual_credential='xFLcltokRem1ADQaM4PP81XXkmvb21rZQhUqbo3C4RjIG4yeMneOJLLc9AWQOa9LeNLH6EuMLe4H+ALp7kFM+Q=='"

"actual_uri='https://testairtservice.blob.core.windows.net/test-container'"

"actual_credential='xFLcltokRem1ADQaM4PP81XXkmvb21rZQhUqbo3C4RjIG4yeMneOJLLc9AWQOa9LeNLH6EuMLe4H+ALp7kFM+Q=='"

In [None]:
#| export


def create_db_uri_for_db_datablob(
    username: str,
    password: str,
    host: str,
    port: int,
    table: str,
    database: str,
    database_server: str,
) -> str:
    """Build the db_uri of a database datablob from its connection params

    The connection string is produced by ``create_connection_string`` and the
    table name is appended to it after a ``/``.

    Args:
        username: Username of db datablob
        password: Password of db datablob
        host: Host of db datablob
        port: Port of db datablob
        table: Table of db datablob
        database: Database to use
        database_server: Server/engine of db datablob
    Returns:
        The db_uri for the db datasource
    """
    connection_params = dict(
        username=username,
        password=password,
        host=host,
        port=port,
        database=database,
        database_server=database_server,
    )
    # The table is not part of the connection string itself, so it goes on as a trailing segment
    return f"{create_connection_string(**connection_params)}/{table}"

In [None]:
# Fixture for the database db_uri helpers; the cell that tests
# get_db_connection_params_from_db_uri reuses this list.
# The password contains "@", which appears as "%40" in the expected db_uri.
db_test_cases = [
    dict(
        username="johndoe",
        password="special1@",
        host="db.example.com",
        port=3306,
        table="events",
        database="airt_service",
        database_server="mysql",
        db_uri="mysql://johndoe:special1%40@db.example.com:3306/airt_service/events",
    )
]

# Building the db_uri from each case's parts must reproduce the expected "db_uri" exactly
for test_case in db_test_cases:
    actual_db_uri = create_db_uri_for_db_datablob(
        username=test_case["username"],
        password=test_case["password"],
        host=test_case["host"],
        port=test_case["port"],
        table=test_case["table"],
        database=test_case["database"],
        database_server=test_case["database_server"],
    )
    display(f"{actual_db_uri=}")
    assert actual_db_uri == test_case["db_uri"]

"actual_db_uri='mysql://****************************************@db.example.com:3306/airt_service/events'"

In [None]:
#| export


def get_db_connection_params_from_db_uri(
    db_uri: str,
) -> Tuple[str, str, str, int, str, str, str]:
    """Get db connection params from db_uri

    The db_uri is expected to look like
    ``<database_server>://<username>:<password>@<host>:<port>/<database>/<table>``.

    Args:
        db_uri: DB uri of db datablob
    Returns:
        The username, password, host, port, table, database, database_server of the datablob as a tuple
    Raises:
        ValueError: If db_uri does not have the expected format or its port is not an integer
    """
    # Raw string: "/" needs no escaping in a regex, and "\/" is an invalid escape in a plain string
    result = re.search(r"(.*)://(.*):(.*)@(.*):(.*)/(.*)/(.*)", db_uri)
    if result is None:
        # The db_uri itself is left out of the message because it may carry a password
        raise ValueError(
            "Invalid db_uri; expected format: "
            "<database_server>://<username>:<password>@<host>:<port>/<database>/<table>"
        )
    (
        database_server,
        username,
        quoted_password,
        host,
        port,
        database,
        table,
    ) = result.groups()
    # Only the password is URL-decoded, exactly as before.
    # NOTE(review): the username is returned as-is — confirm create_connection_string never URL-encodes it.
    password = urlunquote(quoted_password)
    return username, password, host, int(port), table, database, database_server

In [None]:
# Round trip: parsing each expected db_uri must give back every original connection param
for test_case in db_test_cases:
    (
        actual_username,
        actual_password,
        actual_host,
        actual_port,
        actual_table,
        actual_database,
        actual_database_server,
    ) = get_db_connection_params_from_db_uri(db_uri=test_case["db_uri"])
    display(
        f"{actual_username=}",
        f"{actual_password=}",
        f"{actual_host=}",
        f"{actual_port=}",
        f"{actual_table=}",
        f"{actual_database=}",
        f"{actual_database_server=}",
    )

    assert actual_username == test_case["username"]
    # The fixture stores the plain password, so this also checks the "%40" -> "@" decoding
    assert actual_password == test_case["password"]
    assert actual_host == test_case["host"]
    assert actual_port == test_case["port"]
    assert actual_table == test_case["table"]
    assert actual_database == test_case["database"]
    assert actual_database_server == test_case["database_server"]

"actual_username='johndoe'"

"actual_password = '****************************************'"

"actual_host='db.example.com'"

'actual_port=3306'

"actual_table='events'"

"actual_database='airt_service'"

"actual_database_server='mysql'"

In [None]:
#| export


def create_db_uri_for_local_datablob(bucket: Bucket, s3_path: str) -> str:
    """Build the db_uri of a csv datablob from the bucket and path it is stored in

    Args:
        bucket: S3 bucket object; only its ``name`` is used
        s3_path: S3 path in which uploaded csv is stored

    Returns:
        The db uri for the csv datablob, in the form ``s3://<bucket name>/<s3_path>``
    """
    bucket_name = bucket.name
    return "s3://{}/{}".format(bucket_name, s3_path)

In [None]:
# Get the bucket/path pair for a made-up user id and datablob id, then build the db_uri from it
bucket, s3_path = create_s3_datablob_path(
    user_id=999, datablob_id=999, region="eu-west-1"
)

actual = create_db_uri_for_local_datablob(bucket=bucket, s3_path=s3_path)
display(actual)
# NOTE(review): the expected value repeats the same f-string the implementation uses,
# so this assert cannot catch a change in the db_uri format
assert actual == f"s3://{bucket.name}/{s3_path}"

[INFO] botocore.credentials: Found credentials in environment variables.


's3://kumaran-airt-service-eu-west-1/999/datablob/999'

In [None]:
#| export


def calculate_azure_data_object_folder_size_and_path(
    data_object: Union[DataBlob, DataSource]
) -> None:
    """Calculate datasource/datablob folder size based on azure blob storage object size and its path

    Updates ``data_object`` in place:

    - ``folder_size`` becomes the summed size of the blobs listed under the object's
      folder, skipping blobs whose name contains ``METADATA_FOLDER_PATH``
    - ``path`` becomes ``<container url>/<folder path>``
    - ``completed_steps`` is set to 1

    The function only sets attributes; it does not commit anything itself.

    Args:
        data_object: DataBlob or DataSource object

    Raises:
        TypeError: If data_object is neither a DataBlob nor a DataSource
    """
    if isinstance(data_object, DataBlob):
        (
            container_client,
            azure_blob_storage_path,
        ) = create_azure_blob_storage_datablob_path(
            user_id=data_object.user.id, datablob_id=data_object.id, region=data_object.region  # type: ignore
        )
    elif isinstance(data_object, DataSource):
        (
            container_client,
            azure_blob_storage_path,
        ) = create_azure_blob_storage_datasource_path(
            user_id=data_object.user.id, datasource_id=data_object.id, region=data_object.region  # type: ignore
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError(
            f"data_object must be a DataBlob or DataSource, got {type(data_object).__name__}"
        )

    # The trailing "/" restricts the listing to blobs inside the folder itself, so a
    # sibling folder sharing the same name prefix is not counted
    folder_blobs = container_client.list_blobs(
        name_starts_with=azure_blob_storage_path + "/"
    )
    # Sum first and assign afterwards, so the object is left untouched if listing fails
    folder_size = sum(
        blob["size"] for blob in folder_blobs if METADATA_FOLDER_PATH not in blob["name"]
    )

    data_object.completed_steps = 1
    data_object.folder_size = folder_size
    data_object.path = f"{container_client.url}/{azure_blob_storage_path}"  # type: ignore

In [None]:
with get_session_with_context() as session:
    # Look up the user created for this notebook's tests
    user = session.exec(select(User).where(User.username == test_username)).one()
    datasource = DataSource(
        cloud_provider="azure",
        region="westeurope",
        total_steps=1,
        user=user,
    )

    # Persist and refresh so the datasource has an id; the storage path below is built from it
    session.add(datasource)
    session.commit()
    session.refresh(datasource)

    # A freshly created datasource has no size, row count, path or hash yet
    assert not datasource.folder_size
    assert not datasource.no_of_rows
    assert not datasource.path
    assert not datasource.hash

    # Pull the sample parquet data, then push a re-indexed copy into the datasource's own folder
    with RemotePath.from_url(
        remote_url="https://testairtservice.blob.core.windows.net/test-container/account_312571_events",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
    ) as test_s3_path:
        (
            destination_container_client,
            azure_blob_storage_path,
        ) = create_azure_blob_storage_datasource_path(
            user_id=datasource.user.id,
            datasource_id=datasource.id,
            region=datasource.region,
        )
        # NOTE(review): fixed 10 second pause; its purpose is not visible here — confirm it is still needed
        sleep(10)
        with RemotePath.from_url(
            remote_url=f"{destination_container_client.url}/{azure_blob_storage_path}",
            pull_on_enter=False,
            push_on_exit=True,
            exist_ok=True,
            parents=True,
        ) as destination_s3_path:
            ddf = dd.read_parquet(test_s3_path.as_path()).set_index("PersonId")
            ddf.to_parquet(destination_s3_path.as_path())

    calculate_azure_data_object_folder_size_and_path(data_object=datasource)

    assert datasource.folder_size
    assert datasource.completed_steps == 1
    assert (
        f"westeurope/{user.id}/datasource/{datasource.id}" in datasource.path
    ), datasource.path
    # A bare expression inside a `with` block is not rendered, so display it explicitly
    display(datasource)

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url https://testairtservice.blob.core.windows.net/test-container/account_312571_events
[INFO] airt.remote_path: AzureBlobPath._create_cache_path(): created cache path: /tmp/httpstestairtserviceblobcorewindowsnettest-containeraccount_312571_events_cached_6owgds7v
[INFO] airt.remote_path: AzureBlobPath.__init__(): created object for accessing https://testairtservice.blob.core.windows.net/test-container/account_312571_events locally in /tmp/httpstestairtserviceblobcorewindowsnettest-containeraccount_312571_events_cached_6owgds7v
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] airt.remote_path: AzureBlobPath.__enter__(): pulling data from https://testairtservice.blob.core.windows.net/test-container/account_312571_events to /tmp/httpstestairtserviceblobc

In [None]:
#| export


def calculate_s3_data_object_folder_size_and_path(
    data_object: Union[DataBlob, DataSource]
) -> None:
    """Calculate datasource/datablob folder size based on s3 object size and its s3 path

    Updates ``data_object`` in place:

    - ``folder_size`` becomes the summed size of the objects listed under the data
      object's s3 folder, skipping objects whose key contains ``METADATA_FOLDER_PATH``
    - ``path`` becomes ``s3://<bucket name>/<s3 path>``
    - ``completed_steps`` is set to 1

    The function only sets attributes; it does not commit anything itself.

    Args:
        data_object: DataBlob or DataSource object

    Raises:
        TypeError: If data_object is neither a DataBlob nor a DataSource
    """
    if isinstance(data_object, DataBlob):
        destination_bucket, s3_path = create_s3_datablob_path(
            user_id=data_object.user.id, datablob_id=data_object.id, region=data_object.region  # type: ignore
        )
    elif isinstance(data_object, DataSource):
        destination_bucket, s3_path = create_s3_datasource_path(
            user_id=data_object.user.id, datasource_id=data_object.id, region=data_object.region  # type: ignore
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise TypeError(
            f"data_object must be a DataBlob or DataSource, got {type(data_object).__name__}"
        )

    # The trailing "/" restricts the listing to objects inside the folder itself, so a
    # sibling folder sharing the same name prefix is not counted
    folder_objects = destination_bucket.objects.filter(Prefix=s3_path + "/")
    # Sum first and assign afterwards, so the object is left untouched if listing fails
    folder_size = sum(
        obj.size for obj in folder_objects if METADATA_FOLDER_PATH not in obj.key
    )

    data_object.completed_steps = 1
    data_object.folder_size = folder_size
    data_object.path = f"s3://{destination_bucket.name}/{s3_path}"  # type: ignore

In [None]:
with get_session_with_context() as session:
    # Look up the user created for this notebook's tests
    user = session.exec(select(User).where(User.username == test_username)).one()
    datasource = DataSource(
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )

    # Persist and refresh so the datasource has an id; the s3 path below is built from it
    session.add(datasource)
    session.commit()
    session.refresh(datasource)

    # A freshly created datasource has no size, row count, path or hash yet
    assert not datasource.folder_size
    assert not datasource.no_of_rows
    assert not datasource.path
    assert not datasource.hash

    # Pull the sample parquet data, then push a re-indexed copy into the datasource's own s3 folder
    with RemotePath.from_url(
        remote_url=f"s3://test-airt-service/account_312571_events",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ) as test_s3_path:
        destination_bucket, s3_path = create_s3_datasource_path(
            user_id=datasource.user.id,
            datasource_id=datasource.id,
            region=datasource.region,
        )
        # NOTE(review): fixed 10 second pause; its purpose is not visible here — confirm it is still needed
        sleep(10)
        with RemotePath.from_url(
            remote_url=f"s3://{destination_bucket.name}/{s3_path}",
            pull_on_enter=False,
            push_on_exit=True,
            exist_ok=True,
            parents=True,
        ) as destination_s3_path:
            ddf = dd.read_parquet(test_s3_path.as_path()).set_index("PersonId")
            ddf.to_parquet(destination_s3_path.as_path())

    # Function under test: fills in folder_size, completed_steps and path on the object
    calculate_s3_data_object_folder_size_and_path(data_object=datasource)

    assert datasource.folder_size
    assert datasource.completed_steps == 1
    assert (
        datasource.path
        == f"s3://{os.environ['STORAGE_BUCKET_PREFIX']}-eu-west-1/{user.id}/datasource/{datasource.id}"
    ), datasource.path
    display(datasource)
    datasource_id = datasource.id

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_qdgh24xo
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_events locally in /tmp/s3test-airt-serviceaccount_312571_events_cached_qdgh24xo
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://test-airt-service/account_312571_events to /tmp/s3test-airt-serviceaccount_312571_events_cached_qdgh24xo
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/7/datasource/4
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-17datasource4_cached_x6eax_y2
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://ku

DataSource(id=4, uuid=UUID('58852fab-236f-4360-9d95-f9b733dad8ec'), hash=None, total_steps=1, completed_steps=1, folder_size=8153852, no_of_rows=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/7/datasource/4', created=datetime.datetime(2022, 11, 7, 9, 10, 46), user_id=7, pulled_on=None, tags=[])

In [None]:
#| export


def calculate_data_object_folder_size_and_path(
    data_object: Union[DataBlob, DataSource]
):
    """Calculate datasource/datablob folder size and path for the object's cloud provider

    Hands the object to the s3 or the azure implementation depending on its
    ``cloud_provider``; for any other provider value the object is left untouched.

    Args:
        data_object: DataBlob or DataSource object
    """
    provider = data_object.cloud_provider
    if provider == "aws":
        calculate_s3_data_object_folder_size_and_path(data_object=data_object)
        return
    if provider == "azure":
        calculate_azure_data_object_folder_size_and_path(data_object=data_object)

In [None]:
with get_session_with_context() as session:
    # Look up the user created for this notebook's tests
    user = session.exec(select(User).where(User.username == test_username)).one()
    datasource = DataSource(
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )

    # Persist and refresh so the datasource has an id; the s3 path below is built from it
    session.add(datasource)
    session.commit()
    session.refresh(datasource)

    # A freshly created datasource has no size, row count, path or hash yet
    assert not datasource.folder_size
    assert not datasource.no_of_rows
    assert not datasource.path
    assert not datasource.hash

    # Pull the sample parquet data, then push a re-indexed copy into the datasource's own s3 folder
    with RemotePath.from_url(
        remote_url=f"s3://test-airt-service/account_312571_events",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ) as test_s3_path:
        destination_bucket, s3_path = create_s3_datasource_path(
            user_id=datasource.user.id,
            datasource_id=datasource.id,
            region=datasource.region,
        )
        # NOTE(review): fixed 10 second pause; its purpose is not visible here — confirm it is still needed
        sleep(10)
        with RemotePath.from_url(
            remote_url=f"s3://{destination_bucket.name}/{s3_path}",
            pull_on_enter=False,
            push_on_exit=True,
            exist_ok=True,
            parents=True,
        ) as destination_s3_path:
            ddf = dd.read_parquet(test_s3_path.as_path()).set_index("PersonId")
            ddf.to_parquet(destination_s3_path.as_path())

    # Function under test: the provider-dispatching wrapper, exercised here with an aws datasource
    calculate_data_object_folder_size_and_path(data_object=datasource)

    assert datasource.folder_size
    assert datasource.completed_steps == 1
    assert (
        datasource.path
        == f"s3://{os.environ['STORAGE_BUCKET_PREFIX']}-eu-west-1/{user.id}/datasource/{datasource.id}"
    ), datasource.path
    display(datasource)
    # Kept as a notebook global: the pulled_on test cell re-reads this datasource by id
    datasource_id = datasource.id

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_m037i2ul
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_events locally in /tmp/s3test-airt-serviceaccount_312571_events_cached_m037i2ul
[INFO] airt.remote_path: S3Path.__enter__(): pulling data from s3://test-airt-service/account_312571_events to /tmp/s3test-airt-serviceaccount_312571_events_cached_m037i2ul
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/7/datasource/6
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-17datasource6_cached_6xhxozi1
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://ku

DataSource(id=6, uuid=UUID('b2e4b635-f674-49c0-8997-8725f5e3934a'), hash=None, total_steps=1, completed_steps=1, folder_size=8158102, no_of_rows=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/7/datasource/6', created=datetime.datetime(2022, 11, 7, 9, 11, 11), user_id=7, pulled_on=None, tags=[])

In [None]:
#| export


def calculate_data_object_pulled_on(data_object: Union[DataBlob, DataSource]):
    """Calculate datasource/datablob's pulled_on datetime

    Sets ``data_object.pulled_on`` to the current UTC time as a naive datetime
    (no tzinfo attached).

    Args:
        data_object: DataBlob or DataSource object
    """
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now" and
    # drop its tzinfo so the value stays a naive UTC datetime, as before.
    data_object.pulled_on = datetime.now(timezone.utc).replace(tzinfo=None)

In [None]:
with get_session_with_context() as session:
    # Re-read the datasource created by the previous test cell, using the id it stored in datasource_id
    datasource = session.exec(
        select(DataSource).where(DataSource.id == datasource_id)
    ).one()
    calculate_data_object_pulled_on(data_object=datasource)

    display(datasource)
    assert isinstance(datasource.pulled_on, datetime)

DataSource(id=6, uuid=UUID('b2e4b635-f674-49c0-8997-8725f5e3934a'), hash=None, total_steps=1, completed_steps=0, folder_size=None, no_of_rows=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path=None, created=datetime.datetime(2022, 11, 7, 9, 11, 11), user_id=7, pulled_on=datetime.datetime(2022, 11, 7, 9, 11, 39, 779266), tags=[])

In [None]:
#| export


def delete_data_object_files_in_cloud(
    data_object: Union[DataBlob, DataSource, Model, Prediction]
) -> None:
    """
    Delete files for data object stored in cloud - aws or azure

    Nothing is deleted (and no cloud call is made) when the object has not
    completed all of its steps, when it is disabled, or when it has no path.
    Objects whose cloud provider is neither "aws" nor "azure" are ignored.

    Args:
        data_object: object of type DataBlob, DataSource, Model, Prediction
    """

    # Only fully processed, still enabled objects are cleaned up
    if data_object.completed_steps != data_object.total_steps or data_object.disabled:
        return

    # A missing path means there is no cloud location to clean up
    if not data_object.path:
        return

    if data_object.cloud_provider == "aws":
        bucket, s3_path = get_s3_bucket_and_path_from_uri(data_object.path)  # type: ignore
        # The trailing "/" stops the prefix from also matching sibling folders
        # that merely start with the same characters (e.g. ".../1" vs ".../10")
        bucket.objects.filter(Prefix=s3_path + "/").delete()
    elif data_object.cloud_provider == "azure":
        container_client, _ = get_azure_blob_storage_container(
            region=data_object.region
        )
        # The S3 URI helper is reused to split the path; the first segment of the
        # folder part is dropped (presumably the container name - TODO confirm)
        blob_folder = "/".join(
            get_s3_bucket_name_and_folder_from_uri(data_object.path)[1].split("/")[1:]
        )
        for blob in container_client.list_blobs(name_starts_with=blob_folder + "/"):
            container_client.delete_blob(blob)

In [None]:
# Create and pull datablob, datasource to use in following tests
with get_session_with_context() as session:
    # Load the user created for testing at the top of the notebook
    user = session.exec(select(User).where(User.username == test_username)).one()

    # Start a "from local" datablob; the response carries the presigned url and
    # form fields used for the upload further down
    from_local_request = FromLocalRequest(
        path="tmp/test-folder/", tag="my_csv_datasource_tag"
    )
    from_local_response = from_local_start_route(
        from_local_request=from_local_request,
        user=user,
        session=session,
    )

    # Pull the sample parquet dataset into a local cache directory.
    # push_on_exit=False: presumably keeps the csv written below from being
    # pushed back to the sample bucket - confirm against RemotePath
    with RemotePath.from_url(
        remote_url=f"s3://test-airt-service/account_312571_events",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ) as test_s3_path:
        # Convert the parquet data to a single csv file inside the cache directory
        df = pd.read_parquet(test_s3_path.as_path())
        display(df.head())
        df.to_csv(test_s3_path.as_path() / "file.csv", index=False)
        display(list(test_s3_path.as_path().glob("*")))
        # IPython shell escape: preview the first lines of the generated csv
        !head -n 10 {test_s3_path.as_path()/"file.csv"}

        # Upload the csv with the presigned url/fields of the new datablob
        upload_to_s3_with_retry(
            test_s3_path.as_path() / "file.csv",
            from_local_response.presigned["url"],
            from_local_response.presigned["fields"],
        )

    # Resolve the database id of the datablob from the uuid in the response
    datablob_id = session.exec(select(DataBlob).where(DataBlob.uuid == from_local_response.uuid)).one().id
    # Register a datasource for that datablob; it is committed before
    # datasource.id is passed to process_csv below
    datasource = DataSource(
        datablob_id=datablob_id,
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )
    session.add(datasource)
    session.commit()

    # Process the uploaded csv into the datasource; kwargs_json is a JSON string
    # (presumably forwarded to the csv reader - confirm in process_csv)
    process_csv(
        datablob_id=datablob_id,
        datasource_id=datasource.id,
        deduplicate_data=True,
        index_column="PersonId",
        sort_by="OccurredTime",
        blocksize="256MB",
        kwargs_json=json.dumps(
            dict(
                usecols=[0, 1, 2, 3, 4],
                parse_dates=["OccurredTime"],
            )
        ),
    )

# Keep the id at notebook level for the cells that follow
datasource_id = datasource.id

[INFO] airt_service.data.datablob: DataBlob.from_local(): FromLocalResponse(uuid=UUID('281b0e1d-4b09-4bc0-8540-b123c9c74d5a'), type='local', presigned={'url': 'https://kumaran-airt-service-eu-west-1.s3.amazonaws.com/', 'fields': {'key': '****************************************', 'x-amz-algorithm': 'AWS4-HMAC-SHA256', 'x-amz-credential': '********************/20221107/eu-west-1/s3/aws4_request', 'x-amz-date': '20221107T091140Z', 'policy': '************************************************************************************************************************************************************************************************************************************************************', 'x-amz-signature': '****************************'}})
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_31257

Unnamed: 0_level_0,AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
1,312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
2,312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
3,312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
4,312571,loadTests3,2020-01-13 07:03:22,1578908002678,2


[Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/_common_metadata'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/file.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/part.3.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/part.0.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/part.1.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/part.4.parquet'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym/part.2.parquet')]

AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
312571,loadTests3,2020-01-13 07:03:22,1578908002678,2
312571,loadTests1,2020-01-16 09:26:42,1579175802678,2
312571,loadTests2,2020-01-19 11:50:02,1579443602678,2
312571,loadTests3,2020-01-22 14:13:22,1579711402678,2
312571,loadTests1,2020-01-25 16:36:42,1579979202678,2
[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3test-airt-serviceaccount_312571_events_cached_1qgwyrym
[INFO] airt_service.data.csv: process_csv(datablob_id=15, datasource_id=8): processing user uploaded csv file for datablob_id=15 and uploading parquet back to S3 for datasource_id=8
[INFO] airt_service.data.csv: process_csv(datablob_id=15, datasource_id=8): step 1/4: downloading user uploaded file from bucket s3://ku

In [None]:
# After deletion no object may remain under the datablob's / datasource's S3 prefix
with get_session_with_context() as session:
    # Datablob: delete its files, then list what is left under its prefix
    datablob_query = select(DataBlob).where(DataBlob.id == datablob_id)
    datablob = session.exec(datablob_query).one()
    display(datablob)
    delete_data_object_files_in_cloud(data_object=datablob)
    bucket, s3_path = get_s3_bucket_and_path_from_uri(datablob.path)
    objects_in_bucket = list(bucket.objects.filter(Prefix=s3_path + "/"))
    display(f"{objects_in_bucket=}")
    assert not objects_in_bucket

    # Datasource: same check against the datasource's own prefix
    datasource_query = select(DataSource).where(DataSource.id == datasource_id)
    datasource = session.exec(datasource_query).one()
    display(datasource)
    delete_data_object_files_in_cloud(data_object=datasource)
    bucket, s3_path = get_s3_bucket_and_path_from_uri(datasource.path)
    objects_in_bucket = list(bucket.objects.filter(Prefix=s3_path + "/"))
    display(f"{objects_in_bucket=}")
    assert not objects_in_bucket

DataBlob(id=15, uuid=UUID('281b0e1d-4b09-4bc0-8540-b123c9c74d5a'), type='local', uri='s3://kumaran-airt-service-eu-west-1/7/datablob/15', source='tmp/test-folder/', total_steps=1, completed_steps=1, folder_size=28884010, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/7/datablob/15', created=datetime.datetime(2022, 11, 7, 9, 11, 40), user_id=7, pulled_on=None, tags=[Tag(id=1, name='my_csv_datasource_tag', created=datetime.datetime(2022, 11, 7, 9, 9, 59), uuid=UUID('d55a8018-b13c-4244-bbea-09ef8ee1483c')), Tag(id=2, name='latest', created=datetime.datetime(2022, 11, 7, 9, 9, 59), uuid=UUID('487ec313-1f51-498e-ad17-c29a9a10aa25'))])

'objects_in_bucket=[]'

DataSource(id=8, uuid=UUID('c60531e9-eafe-4076-af2d-2467c9d8e82e'), hash='1dd8ee7a0f96a48110dec6e25891d18d', total_steps=1, completed_steps=1, folder_size=6619982, no_of_rows=498961, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-1/7/datasource/8', created=datetime.datetime(2022, 11, 7, 9, 11, 51), user_id=7, pulled_on=datetime.datetime(2022, 11, 7, 9, 11, 57), tags=[])

'objects_in_bucket=[]'