In [None]:
# | default_exp data.s3

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
# | export

import shutil
from datetime import datetime
from typing import *

from airt.helpers import get_s3_bucket_name_and_folder_from_uri
from airt.logger import get_logger
from airt.remote_path import RemotePath
from fastcore.script import call_parse
from fastcore.utils import *
from sqlmodel import select

import airt_service.sanitizer
from airt_service.aws.utils import create_s3_datablob_path
from airt_service.azure.utils import create_azure_blob_storage_datablob_path
from airt_service.constants import METADATA_FOLDER_PATH
from airt_service.data.utils import (
    calculate_data_object_folder_size_and_path,
    calculate_data_object_pulled_on,
    get_s3_connection_params_from_db_uri,
)
from airt_service.db.models import DataBlob, PredictionPush, get_session_with_context
from airt_service.helpers import truncate

In [None]:
import json
import os
from datetime import timedelta

import dask.dataframe as dd
import pytest
from fastapi import BackgroundTasks

from airt_service.aws.utils import create_s3_prediction_path
from airt_service.data.utils import create_db_uri_for_s3_datablob
from airt_service.db.models import (
    DataSource,
    User,
    create_user_for_testing,
    get_session,
)
from airt_service.helpers import commit_or_rollback, set_env_variable_context
from airt_service.model.train import TrainRequest, predict_model, train_model

In [None]:
# Create a user for this notebook's tests; the returned username is used by the
# cells below to look the user up in the db.
test_username = create_user_for_testing(subscription_type="small")
display(test_username)

'uskrsfrset'

In [None]:
# | exporti

# Module-level logger named after this module.
logger = get_logger(__name__)

In [None]:
# | export


def copy_between_s3(
    source_remote_url: str,
    destination_remote_url: str,
    source_access_key: Optional[str] = None,
    source_secret_key: Optional[str] = None,
    destination_access_key: Optional[str] = None,
    destination_secret_key: Optional[str] = None,
    datablob: Optional[DataBlob] = None,
    skip_metadata_dir: Optional[bool] = False,
) -> None:
    """Copy files from a source S3 path to a destination S3 path

    The source is pulled into a local cache on entering its context, its top-level
    entries are moved into the destination's local cache, and the destination
    cache is pushed when the destination context exits.

    By default, all files are copied to the destination_remote_url. In case
    the **skip_metadata_dir** flag is set to **True**, then the **.metadata_by_airt**
    folder will not be copied to the destination_remote_url.

    Args:
        source_remote_url: S3 uri where files to copy are located
        destination_remote_url: S3 uri to copy files
        source_access_key: Source s3 bucket access key
        source_secret_key: Source s3 bucket secret key
        destination_access_key: Destination s3 bucket access key
        destination_secret_key: Destination s3 bucket secret key
        datablob: Optional datablob object to calculate pulled_on field
        skip_metadata_dir: If set to **True** then the **.metadata_by_airt** folder
            will not be copied to the destination_remote_url.

    Raises:
        ValueError: If nothing was moved into the destination cache, i.e. the
            source uri is invalid or has no files to copy.
    """
    with RemotePath.from_url(
        remote_url=destination_remote_url,
        pull_on_enter=False,
        push_on_exit=True,
        exist_ok=True,
        parents=True,
        access_key=destination_access_key,
        secret_key=destination_secret_key,
    ) as destination_remote_path:
        sync_path = destination_remote_path.as_path()
        with RemotePath.from_url(
            remote_url=source_remote_url,
            pull_on_enter=True,
            push_on_exit=False,
            exist_ok=True,
            parents=False,
            access_key=source_access_key,
            secret_key=source_secret_key,
        ) as source_remote_path:
            if datablob is not None:
                calculate_data_object_pulled_on(datablob)

            source_entries = source_remote_path.as_path().iterdir()

            if skip_metadata_dir:
                # Compare the entry name exactly instead of substring-matching the
                # whole absolute path: a substring test can also exclude unrelated
                # entries whose (cache) path merely contains the folder name.
                # NOTE(review): assumes METADATA_FOLDER_PATH is a single folder
                # name (no path separators) - confirm against the constant.
                source_entries = [
                    entry
                    for entry in source_entries
                    if entry.name != METADATA_FOLDER_PATH
                ]

            # Move (not copy) out of the source cache; the source is never pushed
            # back (push_on_exit=False), so the remote source is left untouched.
            for entry in source_entries:
                shutil.move(str(entry), sync_path)

        # An empty destination cache means there was nothing to copy.
        if not any(sync_path.glob("*")):
            raise ValueError(
                f"URI {source_remote_url} is invalid or no files available"
            )

In [None]:
# Test case for skip_metadata_dir=True
#
# Builds a source location holding one data file plus a metadata folder with two
# files, copies it to a second location with the metadata folder skipped, and
# checks what arrived at the destination.
# NOTE: `user` and `source_remote_url` assigned here are reused by the next test cell.

with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()

    # Datablob row used to derive the source S3 path
    datablob = DataBlob(
        type="s3",
        uri="",
        source="",
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    # Creating source bucket
    bucket, s3_path = create_s3_datablob_path(
        user_id=user.id, datablob_id=datablob.id, region=datablob.region
    )
    source_remote_url = f"s3://{bucket.name}/{s3_path}"
    # Nothing is pulled on enter; the files created in the local cache below are
    # pushed to the source url when the context exits.
    with RemotePath.from_url(
        remote_url=source_remote_url,
        pull_on_enter=False,
        push_on_exit=True,
        exist_ok=True,
        parents=True,
    ) as cache_path:
        processed_cache_path = cache_path.as_path()
        (processed_cache_path / "file-1.parquet").touch()

        metadata_folder_path = processed_cache_path / METADATA_FOLDER_PATH
        metadata_folder_path.mkdir(parents=True, exist_ok=True)

        (metadata_folder_path / "metadata-1.parquet").touch()
        (metadata_folder_path / "metadata-2.parquet").touch()

    # Creating destination bucket
    datablob = DataBlob(
        type="s3",
        uri="",
        source="",
        cloud_provider="aws",
        region="eu-west-3",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    destination_bucket, destination_s3_path = create_s3_datablob_path(
        user_id=user.id, datablob_id=datablob.id, region=datablob.region
    )

    destination_remote_url = f"s3://{destination_bucket.name}/{destination_s3_path}"

    display(f"{source_remote_url=}")
    display(f"{destination_remote_url=}")

    copy_between_s3(
        source_remote_url=source_remote_url,
        destination_remote_url=destination_remote_url,
        skip_metadata_dir=True,
    )

    # Validating the contents of the destination bucket
    with RemotePath.from_url(
        remote_url=destination_remote_url,
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
    ) as cache_path:
        # With the metadata folder skipped, only file-1.parquet should be present.
        files = list(cache_path.as_path().rglob("*.*"))
        assert len(files) == 1, len(files)

[INFO] botocore.credentials: Found credentials in environment variables.
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/datablob/7
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_9lt5joja
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/datablob/7 locally in /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_9lt5joja
[INFO] airt.remote_path: S3Path.__exit__(): pushing data from /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_9lt5joja to s3://kumaran-airt-service-eu-west-1/82/datablob/7
[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_9lt5joja


"source_remote_url='s3://kumaran-airt-service-eu-west-1/82/datablob/7'"

"destination_remote_url='s3://kumaran-airt-service-eu-west-3/82/datablob/8'"

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-3/82/datablob/8
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-382datablob8_cached_nk8597cm
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-3/82/datablob/8 locally in /tmp/s3kumaran-airt-service-eu-west-382datablob8_cached_nk8597cm
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/datablob/7
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_1lrl5t0a
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/datablob/7 locally in /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_1lrl5t0a
[INFO] airt.remote_path: S3Path.__enter_

In [None]:
# Test case for skip_metadata_dir=False
#
# NOTE: relies on `source_remote_url` from the previous test cell (the source
# location holding file-1.parquet plus the metadata folder).

with get_session_with_context() as session:
    # Load the user in this session, as the other test cells do, instead of
    # reusing the instance left over from the previous cell's closed session.
    user = session.exec(select(User).where(User.username == test_username)).one()

    datablob = DataBlob(
        type="s3",
        uri="",
        source="",
        cloud_provider="aws",
        region="eu-west-2",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    destination_bucket, destination_s3_path = create_s3_datablob_path(
        user_id=user.id, datablob_id=datablob.id, region=datablob.region
    )

    destination_remote_url = f"s3://{destination_bucket.name}/{destination_s3_path}"

    display(f"{source_remote_url=}")
    display(f"{destination_remote_url=}")

    copy_between_s3(
        source_remote_url=source_remote_url,
        destination_remote_url=destination_remote_url,
        skip_metadata_dir=False,
    )

    # Validating the contents of the destination bucket
    with RemotePath.from_url(
        remote_url=destination_remote_url,
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
    ) as cache_path:
        # NOTE(review): the expected 4 is presumably file-1.parquet, the metadata
        # folder itself (its name matches "*.*") and its two files - confirm.
        files = list(cache_path.as_path().rglob("*.*"))
        assert len(files) == 4, len(files)

"source_remote_url='s3://kumaran-airt-service-eu-west-1/82/datablob/7'"

"destination_remote_url='s3://kumaran-airt-service-eu-west-2/82/datablob/9'"

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-2/82/datablob/9
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-282datablob9_cached_qqyipeti
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-2/82/datablob/9 locally in /tmp/s3kumaran-airt-service-eu-west-282datablob9_cached_qqyipeti
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/datablob/7
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_odkzp4ea
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/datablob/7 locally in /tmp/s3kumaran-airt-service-eu-west-182datablob7_cached_odkzp4ea
[INFO] airt.remote_path: S3Path.__enter_

In [None]:
# | export


@call_parse  # type: ignore
def s3_pull(datablob_id: int) -> None:
    """Pull the data from s3 and updates progress in db

    Copies the files found at the datablob's source S3 uri into the datablob's
    own storage location (an S3 path for the "aws" cloud provider, an Azure blob
    storage path for "azure"). Errors raised while copying are stored, truncated,
    in the datablob's error field instead of being propagated.

    Args:
        datablob_id: Id of datablob in db

    Example:
        The following code executes a CLI command:
        ```
        s3_pull 1
        ```
    """
    with get_session_with_context() as session:
        datablob = session.exec(
            select(DataBlob).where(DataBlob.id == datablob_id)
        ).one()

        # Clear the result fields before running the pull
        datablob.error = None
        datablob.completed_steps = 0
        datablob.folder_size = None
        datablob.path = None

        (
            uri,
            source_access_key,
            source_secret_key,
        ) = get_s3_connection_params_from_db_uri(db_uri=datablob.uri)

        try:
            source_bucket, folder = get_s3_bucket_name_and_folder_from_uri(uri=uri)
            source_remote_url = f"s3://{source_bucket}/{folder}"

            if datablob.cloud_provider == "aws":
                destination_bucket, s3_path = create_s3_datablob_path(
                    user_id=datablob.user.id,
                    datablob_id=datablob.id,
                    region=datablob.region,
                )
                destination_remote_url = f"s3://{destination_bucket.name}/{s3_path}"
            elif datablob.cloud_provider == "azure":
                (
                    destination_container_client,
                    destination_azure_blob_storage_path,
                ) = create_azure_blob_storage_datablob_path(
                    user_id=datablob.user.id,
                    datablob_id=datablob.id,
                    region=datablob.region,
                )
                destination_remote_url = f"{destination_container_client.url}/{destination_azure_blob_storage_path}"
            else:
                # Without this branch destination_remote_url would be unbound and
                # an unrelated UnboundLocalError message would be stored as the error.
                raise ValueError(
                    f"Unsupported cloud provider: {datablob.cloud_provider}"
                )

            # The destination cache starts empty (no pull) and is pushed on exit
            with RemotePath.from_url(
                remote_url=destination_remote_url,
                pull_on_enter=False,
                push_on_exit=True,
                exist_ok=True,
                parents=True,
            ) as destination_remote_path:
                sync_path = destination_remote_path.as_path()
                with RemotePath.from_url(
                    remote_url=source_remote_url,
                    pull_on_enter=True,
                    push_on_exit=False,
                    exist_ok=True,
                    parents=False,
                    access_key=source_access_key,
                    secret_key=source_secret_key,
                ) as source_s3_path:
                    calculate_data_object_pulled_on(datablob)

                    # Move every top-level entry of the pulled source into the
                    # destination cache
                    for source_entry in source_s3_path.as_path().iterdir():
                        shutil.move(str(source_entry), sync_path)

                # An empty destination cache means there was nothing to pull
                if not any(sync_path.glob("*")):
                    raise ValueError(
                        f"URI {source_remote_url} is invalid or no files available"
                    )

            # Calculate folder size in S3/Azure blob storage
            calculate_data_object_folder_size_and_path(datablob)
        except Exception as e:
            # Record the failure on the datablob instead of raising
            datablob.error = truncate(str(e))

        session.add(datablob)
        session.commit()

In [None]:
# s3_pull into AWS storage: the datablob must end up with a folder size and a path
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "s3://test-airt-service/account_312571_events"
    db_uri = create_db_uri_for_s3_datablob(
        uri=uri,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )
    datablob = DataBlob(
        type="s3",
        uri=db_uri,
        source=uri,
        cloud_provider="aws",
        region="eu-west-3",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    # Nothing has been pulled yet
    assert not datablob.folder_size
    assert not datablob.path

    # Keep the ids for the verification session below
    user_id = user.id
    datablob_id = datablob.id
    s3_pull(datablob_id=datablob_id)

# Re-read the datablob in a new session to see what s3_pull stored
with get_session_with_context() as session:
    datablob = session.exec(select(DataBlob).where(DataBlob.id == datablob_id)).one()
    display(datablob)
    assert datablob.folder_size == 11219613, datablob.folder_size
    expected_path = f"s3://{os.environ['STORAGE_BUCKET_PREFIX']}-eu-west-3/{user_id}/datablob/{datablob.id}"
    assert datablob.path == expected_path, datablob.path

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-3/82/datablob/10
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-382datablob10_cached_tcopleiq
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-3/82/datablob/10 locally in /tmp/s3kumaran-airt-service-eu-west-382datablob10_cached_tcopleiq
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_r7a_djad
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_events locally in /tmp/s3test-airt-serviceaccount_312571_events_cached_r7a_djad
[INFO] airt.remote_path: S3Path.__enter__(): pulling

DataBlob(id=10, uuid=UUID('bbf6dad0-ea3f-4733-ab94-caec4fc5a465'), type='s3', uri='s3://****************************************@test-airt-service/account_312571_events', source='s3://test-airt-service/account_312571_events', total_steps=1, completed_steps=1, folder_size=11219613, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-3', error=None, disabled=False, path='s3://kumaran-airt-service-eu-west-3/82/datablob/10', created=datetime.datetime(2022, 10, 20, 6, 41, 5), user_id=82, pulled_on=datetime.datetime(2022, 10, 20, 6, 41, 11), tags=[])

In [None]:
# s3_pull into Azure blob storage: same source data, azure cloud provider
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "s3://test-airt-service/account_312571_events"
    region = "westeurope"
    db_uri = create_db_uri_for_s3_datablob(
        uri=uri,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    )
    datablob = DataBlob(
        type="s3",
        uri=db_uri,
        source=uri,
        cloud_provider="azure",
        region=region,
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    # Nothing has been pulled yet
    assert not datablob.folder_size
    assert not datablob.path

    # Keep the ids for the verification session below
    user_id = user.id
    datablob_id = datablob.id
    s3_pull(datablob_id=datablob_id)

# Re-read the datablob in a new session to see what s3_pull stored
with get_session_with_context() as session:
    datablob = session.exec(select(DataBlob).where(DataBlob.id == datablob_id)).one()
    display(datablob)
    assert datablob.folder_size == 11219613, datablob.folder_size
    expected_path_part = f"{region}/{user_id}/datablob/{datablob_id}"
    assert expected_path_part in datablob.path, datablob.path

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url https://kumsairtsdevwesteurope.blob.core.windows.net/kumsairtsdevwesteurope/82/datablob/12
[INFO] airt.remote_path: AzureBlobPath._create_cache_path(): created cache path: /tmp/httpskumsairtsdevwesteuropeblobcorewindowsnetkumsairtsdevwesteurope82datablob12_cached_w144flnx
[INFO] airt.remote_path: 

DataBlob(id=12, uuid=UUID('33e979e6-60b5-41e6-abb1-6adb1b553dea'), type='s3', uri='s3://****************************************@test-airt-service/account_312571_events', source='s3://test-airt-service/account_312571_events', total_steps=1, completed_steps=1, folder_size=11219613, cloud_provider=<CloudProvider.azure: 'azure'>, region='westeurope', error=None, disabled=False, path='https://kumsairtsdevwesteurope.blob.core.windows.net/kumsairtsdevwesteurope/82/datablob/12', created=datetime.datetime(2022, 10, 20, 6, 41, 26), user_id=82, pulled_on=datetime.datetime(2022, 10, 20, 6, 41, 37), tags=[])

In [None]:
# s3_pull from a folder that does not exist: the failure must be recorded on the
# datablob and no folder size / path must be set.
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "s3://test-airt-service/folder_does_not_exists"
    datablob = DataBlob(
        type="s3",
        uri=create_db_uri_for_s3_datablob(
            uri=uri,
            access_key=os.environ["AWS_ACCESS_KEY_ID"],
            secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        ),
        source=uri,
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)

    assert not datablob.folder_size
    assert not datablob.path

    # Capture the id while the instance is still attached to its session, so the
    # verification below does not read attributes from a detached instance.
    datablob_id = datablob.id
    s3_pull(datablob_id=datablob_id)

with get_session_with_context() as session:
    datablob = session.exec(select(DataBlob).where(DataBlob.id == datablob_id)).one()
    display(datablob)
    assert datablob.error is not None
    assert (
        f"URI {uri} is invalid or no files available" in datablob.error
    ), datablob.error
    assert not datablob.folder_size
    assert not datablob.path

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/datablob/14
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182datablob14_cached_lpa48fyz
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/datablob/14 locally in /tmp/s3kumaran-airt-service-eu-west-182datablob14_cached_lpa48fyz
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/folder_does_not_exists
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-servicefolder_does_not_exists_cached_b9utql4i
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/folder_does_not_exists locally in /tmp/s3test-airt-servicefolder_does_not_exists_cached_b9utql4i
[INFO] airt.remote_path: S3Path.__enter__(): pul

DataBlob(id=14, uuid=UUID('1fb3b435-82cd-4bd8-8d8d-5e9b3e2c76d6'), type='s3', uri='s3://****************************************@test-airt-service/folder_does_not_exists', source='s3://test-airt-service/folder_does_not_exists', total_steps=1, completed_steps=0, folder_size=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error='URI s3://test-airt-service/folder_does_not_exists is invalid or no files available', disabled=False, path=None, created=datetime.datetime(2022, 10, 20, 6, 42, 15), user_id=82, pulled_on=datetime.datetime(2022, 10, 20, 6, 42, 17), tags=[])

In [None]:
# | export


@call_parse  # type: ignore
def s3_push(prediction_push_id: int) -> None:
    """Push the prediction data to S3 and update its progress in db

    Copies the files found at the prediction's path into the S3 uri stored on the
    prediction push. Errors raised while copying are stored, truncated, in the
    prediction push's error field instead of being propagated.

    Args:
        prediction_push_id: Id of prediction_push

    Example:
        The following code executes a CLI command:
        ```
        s3_push 1
        ```
    """
    with get_session_with_context() as session:
        prediction_push = session.exec(
            select(PredictionPush).where(PredictionPush.id == prediction_push_id)
        ).one()

        # Clear the result fields before running the push
        prediction_push.error = None
        prediction_push.completed_steps = 0

        (
            uri,
            destination_access_key,
            destination_secret_key,
        ) = get_s3_connection_params_from_db_uri(db_uri=prediction_push.uri)

        try:
            (
                destination_bucket,
                destination_s3_path,
            ) = get_s3_bucket_name_and_folder_from_uri(uri=uri)
            source_remote_url = prediction_push.prediction.path
            # A prediction without a stored path has nothing to push; fail with a
            # clear message before any remote path is opened.
            if not source_remote_url:
                raise ValueError(
                    f"Prediction {prediction_push.prediction_id} has no path to push from"
                )
            destination_remote_url = f"s3://{destination_bucket}/{destination_s3_path}"

            # The destination cache starts empty (no pull) and is pushed on exit
            with RemotePath.from_url(
                remote_url=destination_remote_url,
                pull_on_enter=False,
                push_on_exit=True,
                exist_ok=True,
                parents=True,
                access_key=destination_access_key,
                secret_key=destination_secret_key,
            ) as destination_remote_path:
                sync_path = destination_remote_path.as_path()
                with RemotePath.from_url(
                    remote_url=source_remote_url,
                    pull_on_enter=True,
                    push_on_exit=False,
                    exist_ok=True,
                    parents=False,
                ) as source_remote_path:
                    # Move every top-level entry of the pulled source into the
                    # destination cache
                    for source_entry in source_remote_path.as_path().iterdir():
                        shutil.move(str(source_entry), sync_path)

                # An empty destination cache means there was nothing to push
                if not any(sync_path.glob("*")):
                    raise ValueError(
                        f"URI {source_remote_url} is invalid or no files available"
                    )

            prediction_push.completed_steps = 1
        except Exception as e:
            # Record the failure on the prediction push instead of raising
            prediction_push.error = truncate(str(e))

        session.add(prediction_push)
        session.commit()

In [None]:
# End-to-end check for s3_push: train a model, create a prediction, place files
# at the prediction's S3 path, then push them and verify the push is marked
# complete.
# NOTE(review): `datablob` is not assigned in this cell - it is whatever the
# previous cell left behind (per the displayed output, the datablob whose pull
# failed). Confirm this cross-cell dependency is intended.
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()

    with commit_or_rollback(session):
        display(f"{datablob=}")
        # NOTE(review): `datasource` is never passed to session.add() here;
        # presumably it is persisted through the `user` relationship - confirm.
        datasource = DataSource(
            datablob_id=datablob.id,
            cloud_provider=datablob.cloud_provider,
            region=datablob.region,
            total_steps=1,
            user=user,
        )

    train_request = TrainRequest(
        data_uuid=datasource.uuid,
        client_column="AccountId",
        target_column="DefinitionId",
        target="load*",
        predict_after=timedelta(seconds=20 * 24 * 60 * 60),  # 20 days
    )

    model = train_model(train_request=train_request, user=user, session=session)

    # Request the prediction with the JOB_EXECUTOR env variable set to "fastapi"
    b = BackgroundTasks()
    with set_env_variable_context(variable="JOB_EXECUTOR", value="fastapi"):
        prediction = predict_model(
            model_uuid=model.uuid, user=user, session=session, background_tasks=b
        )
    display(prediction)

    # Put sample files at the prediction's S3 location so that s3_push has
    # something to copy
    bucket, s3_path = create_s3_prediction_path(
        user_id=user.id, prediction_id=prediction.id, region=prediction.region
    )
    copy_between_s3(
        source_remote_url=r"s3://test-airt-service/account_312571_events",
        destination_remote_url=f"s3://{bucket.name}/{s3_path}",
    )

    # Store that location as the prediction's path; s3_push reads it as its source
    with commit_or_rollback(session):
        prediction.path = f"s3://{bucket.name}/{s3_path}"
        session.add(prediction)

    # The push destination uri is the same S3 location as the prediction's path
    prediction_push = PredictionPush(
        total_steps=1,
        prediction_id=prediction.id,
        uri=create_db_uri_for_s3_datablob(
            uri=f"s3://{bucket.name}/{s3_path}",
            access_key=os.environ["AWS_ACCESS_KEY_ID"],
            secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        ),
    )
    session.add(prediction_push)
    session.commit()
    display(prediction_push)

    assert prediction_push.completed_steps == 0

    s3_push(prediction_push_id=prediction_push.id)

# Re-read the prediction push in a new session to see what s3_push committed
with get_session_with_context() as session:
    prediction_push = session.exec(
        select(PredictionPush).where(PredictionPush.id == prediction_push.id)
    ).one()
    display(prediction_push)
    assert prediction_push.completed_steps == prediction_push.total_steps

"datablob=DataBlob(id=14, uuid=UUID('1fb3b435-82cd-4bd8-8d8d-5e9b3e2c76d6'), type='s3', uri='s3://****************************************@test-airt-service/folder_does_not_exists', source='s3://test-airt-service/folder_does_not_exists', total_steps=1, completed_steps=0, folder_size=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error='URI s3://test-airt-service/folder_does_not_exists is invalid or no files available', disabled=False, path=None, created=datetime.datetime(2022, 10, 20, 6, 42, 15), user_id=82, pulled_on=datetime.datetime(2022, 10, 20, 6, 42, 17), tags=[])"

[INFO] airt_service.batch_job: create_batch_job(): command='predict 9', task='csv_processing'
[INFO] airt_service.batch_job_components.base: Entering FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job: batch_ctx=FastAPIBatchJobContext(task=csv_processing)
[INFO] airt_service.batch_job_components.fastapi: FastAPIBatchJobContext.create_job(self=FastAPIBatchJobContext(task=csv_processing), command='predict 9', environment_vars={'AWS_ACCESS_KEY_ID': '********************', 'AWS_SECRET_ACCESS_KEY': '****************************************', 'AWS_DEFAULT_REGION': 'eu-west-1', 'AZURE_SUBSCRIPTION_ID': '************************************', 'AZURE_TENANT_ID': '************************************', 'AZURE_CLIENT_ID': '************************************', 'AZURE_CLIENT_SECRET': '****************************************', 'AZURE_STORAGE_ACCOUNT_PREFIX': 'kumsairtsdev', 'AZURE_RESOURCE_GROUP': 'kumaran-airt-service-dev', 'STORAGE_BUCKET_PREFIX': 'kumaran-airt-service', 

Prediction(total_steps=3, created=datetime.datetime(2022, 10, 20, 6, 42, 18), error=None, uuid=UUID('bd92f179-0baa-4304-b6d4-a2fe545e4456'), disabled=False, datasource_id=6, id=9, model_id=8, path=None, cloud_provider=<CloudProvider.aws: 'aws'>, completed_steps=0, region='eu-west-1')

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/prediction/9
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached_h5e39q4f
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/prediction/9 locally in /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached_h5e39q4f
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_h7avz68b
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_events locally in /tmp/s3test-airt-serviceaccount_312571_events_cached_h7avz68b
[INFO] airt.remote_path: S3Path.__enter__(): pul

PredictionPush(id=5, uuid=UUID('3b08e353-8917-4669-9920-8091afca616d'), uri='s3://****************************************@kumaran-airt-service-eu-west-1/82/prediction/9', total_steps=1, completed_steps=0, error=None, created=datetime.datetime(2022, 10, 20, 6, 42, 32), prediction_id=9, )

[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/prediction/9
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached__c7pghx_
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/prediction/9 locally in /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached__c7pghx_
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://kumaran-airt-service-eu-west-1/82/prediction/9
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached_b15zl0mp
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://kumaran-airt-service-eu-west-1/82/prediction/9 locally in /tmp/s3kumaran-airt-service-eu-west-182prediction9_cached_b15zl0mp
[INFO] airt.remote_path:

PredictionPush(id=5, uuid=UUID('3b08e353-8917-4669-9920-8091afca616d'), uri='s3://****************************************@kumaran-airt-service-eu-west-1/82/prediction/9', total_steps=1, completed_steps=1, error=None, created=datetime.datetime(2022, 10, 20, 6, 42, 32), prediction_id=9, )