In [None]:
#| default_exp aws.utils

In [None]:
from airt.testing import activate_by_import

[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| export

import os
import yaml
from pathlib import Path
from typing import *

import boto3
import requests
from fastapi import status, HTTPException
from mypy_boto3_s3.service_resource import Bucket


from airt.helpers import get_s3_bucket_name_and_folder_from_uri
from airt.logger import get_logger

In [None]:
import tempfile
from time import sleep

import dask.dataframe as dd
import pytest
from sqlmodel import select

from airt.remote_path import RemotePath
from airt_service.data.datablob import FromLocalRequest, from_local_start_route
from airt_service.db.models import (
    create_user_for_testing,
    get_session_with_context,
    User,
)

[INFO] airt.executor.subcommand: Module loaded.


In [None]:
# Create a test user; the returned username is used by the presigned-upload test
# further down to look the user up in the database
test_username = create_user_for_testing()
display(test_username)

'ieaggdcklq'

In [None]:
#| exporti

# Module-level logger shared by the functions defined in this module
logger = get_logger(__name__)

In [None]:
#| export


def get_available_aws_regions() -> List[str]:
    """Return the AWS regions supported by this service

    Returns:
        A new list with the names of the supported regions, always in the same order
    """

    # The regions are hardcoded instead of being fetched with
    # boto3.session.Session().get_available_regions("s3") because batch supports
    # one less region than s3.
    #
    # Regions deliberately left out:
    #   af-south-1     - Africa Cape Town
    #   ap-east-1      - Asia Pacific Hong Kong
    #   ap-northeast-3 - Problem with creating gpu instances for training during build_wheel stage
    #   eu-south-1     - Europe Milan
    #   me-south-1     - Middle East Bahrain
    supported_regions = (
        "ap-northeast-1",
        "ap-northeast-2",
        "ap-south-1",
        "ap-southeast-1",
        "ap-southeast-2",
        "ca-central-1",
        "eu-central-1",
        "eu-north-1",
        "eu-west-1",
        "eu-west-2",
        "eu-west-3",
        "sa-east-1",
        "us-east-1",
        "us-east-2",
        "us-west-1",
        "us-west-2",
    )
    return list(supported_regions)

In [None]:
# Show the list of supported regions
get_available_aws_regions()

['ap-northeast-1',
 'ap-northeast-2',
 'ap-south-1',
 'ap-southeast-1',
 'ap-southeast-2',
 'ca-central-1',
 'eu-central-1',
 'eu-north-1',
 'eu-west-1',
 'eu-west-2',
 'eu-west-3',
 'sa-east-1',
 'us-east-1',
 'us-east-2',
 'us-west-1',
 'us-west-2']

In [None]:
#| export


def verify_aws_region(region: str):
    """
    Check that the given region is one of the supported regions

    Args:
        region: region name
    Raises:
        HTTPException: With status code 400 if region is not in the list returned
            by get_available_aws_regions
    """
    supported = get_available_aws_regions()
    if region in supported:
        return

    # The error detail lists every supported region so the caller can pick a valid one
    supported_csv = ", ".join(supported)
    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=f"Unknown region - {region}; Available regions are {supported_csv}",
    )

In [None]:
# A supported region passes without raising
verify_aws_region("eu-west-1")

# An unsupported region is rejected with an HTTPException mentioning "Unknown region"
with pytest.raises(HTTPException) as e:
    verify_aws_region(region="region-doesnt-exists")
assert "Unknown region" in str(e)
display(e)

<ExceptionInfo HTTPException(status_code=400, detail='Unknown region - region-doesnt-exists; Available regions are ap-northeast-1, ap...l-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1, us-east-1, us-east-2, us-west-1, us-west-2') tblen=2>

In [None]:
#| export


def get_s3_storage_bucket(region: str = "eu-west-1") -> Tuple[Bucket, str]:
    """Get the root s3 bucket to store datasources, models, predictions

    The bucket name is built from the STORAGE_BUCKET_PREFIX environment variable and
    the region. If boto3 reports no creation date for that bucket, the bucket is
    created in the given region.

    Args:
        region: region name
    Returns:
        The root storage s3 bucket and the base path inside it as a tuple
    Raises:
        HTTPException: If region is not a valid region
        KeyError: If the STORAGE_BUCKET_PREFIX environment variable is not set
    """
    verify_aws_region(region)

    storage_bucket = f"s3://{os.environ['STORAGE_BUCKET_PREFIX']}-{region}"
    bucket_name, base_path = get_s3_bucket_name_and_folder_from_uri(storage_bucket)

    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)

    if not bucket.creation_date:
        s3_client = boto3.client("s3", region_name=region)
        create_bucket_kwargs: Dict[str, Any] = {"Bucket": bucket_name}
        # us-east-1 is the default S3 location; S3 rejects it when it is passed as an
        # explicit LocationConstraint, so the configuration is only sent for other regions
        if region != "us-east-1":
            create_bucket_kwargs["CreateBucketConfiguration"] = {
                "LocationConstraint": region
            }
        try:
            s3_client.create_bucket(**create_bucket_kwargs)
        except s3_client.exceptions.BucketAlreadyOwnedByYou:
            # Another caller created the bucket in the meantime; nothing left to do
            logger.info("Bucket already created")
        # Re-create the resource object now that the bucket exists
        bucket = s3.Bucket(bucket_name)
    return bucket, base_path

In [None]:
# Fetch (or create) the storage bucket for eu-west-1; this talks to AWS and reads
# the STORAGE_BUCKET_PREFIX environment variable
actual = get_s3_storage_bucket(region="eu-west-1")
display(actual)
assert actual

[INFO] botocore.credentials: Found credentials in environment variables.


(s3.Bucket(name='kumaran-airt-service-eu-west-1'), '')

In [None]:
# Same check for a second supported region
actual = get_s3_storage_bucket(region="eu-west-3")
display(actual)
assert actual

(s3.Bucket(name='kumaran-airt-service-eu-west-3'), '')

In [None]:
# An unsupported region must be rejected before any bucket lookup happens
with pytest.raises(HTTPException) as e:
    get_s3_storage_bucket(region="region-doesnt-exists")
assert "Unknown region" in str(e)
display(e)

<ExceptionInfo HTTPException(status_code=400, detail='Unknown region - region-doesnt-exists; Available regions are ap-northeast-1, ap...l-1, eu-central-1, eu-north-1, eu-west-1, eu-west-2, eu-west-3, sa-east-1, us-east-1, us-east-2, us-west-1, us-west-2') tblen=3>

In [None]:
#| export


def create_s3_datablob_path(
    user_id: int, datablob_id: int, region: str
) -> Tuple[Bucket, str]:
    """Build the S3 location under which a datablob is stored

    Args:
        user_id: User id
        datablob_id: Datablob id
        region: Region of the storage bucket

    Returns:
        The root storage bucket object and the s3 path as a tuple
    """
    bucket, base_path = get_s3_storage_bucket(region=region)

    datablob_key = f"{user_id}/datablob/{datablob_id}"
    # Only prepend the base path when the storage bucket actually has one
    if base_path:
        datablob_key = f"{base_path}/{datablob_key}"

    return bucket, datablob_key

In [None]:
# The datablob path must be "<user_id>/datablob/<datablob_id>", prefixed with the
# bucket's base path when there is one
actual = create_s3_datablob_path(user_id=999, datablob_id=999, region="eu-west-1")
display(actual)
expected = "999/datablob/999"
_, base_path = get_s3_storage_bucket(region="eu-west-1")
expected = f"{base_path}/{expected}" if base_path else expected

assert actual[0]
assert actual[1] == expected

# bucket = actual[0]
# display(bucket.meta.client.head_object(Bucket=bucket.name, Key="999/datasource/999/result.json"))

(s3.Bucket(name='kumaran-airt-service-eu-west-1'), '999/datablob/999')

In [None]:
#| export


def create_s3_datasource_path(
    user_id: int, datasource_id: int, region: str
) -> Tuple[Bucket, str]:
    """Build the S3 location under which a datasource is stored

    Args:
        user_id: User id
        datasource_id: Datasource id to store
        region: Region of the storage bucket

    Returns:
        The root storage bucket object and the s3 path as a tuple
    """
    bucket, base_path = get_s3_storage_bucket(region=region)

    datasource_key = f"{user_id}/datasource/{datasource_id}"
    # Only prepend the base path when the storage bucket actually has one
    if base_path:
        datasource_key = f"{base_path}/{datasource_key}"

    return bucket, datasource_key

In [None]:
# The datasource path must be "<user_id>/datasource/<datasource_id>", prefixed with
# the bucket's base path when there is one
actual = create_s3_datasource_path(user_id=999, datasource_id=999, region="eu-west-1")
display(actual)
expected = "999/datasource/999"
_, base_path = get_s3_storage_bucket(region="eu-west-1")
expected = f"{base_path}/{expected}" if base_path else expected


assert actual[0]
assert actual[1] == expected

# bucket = actual[0]
# display(bucket.meta.client.head_object(Bucket=bucket.name, Key="999/datasource/999/result.json"))

(s3.Bucket(name='kumaran-airt-service-eu-west-1'), '999/datasource/999')

In [None]:
#| export


def create_s3_prediction_path(
    user_id: int, prediction_id: int, region: str
) -> Tuple[Bucket, str]:
    """Build the S3 location under which prediction results are stored

    Args:
        user_id: User id
        prediction_id: Prediction id
        region: Region of the storage bucket

    Returns:
        The root storage bucket object and the s3 path as a tuple
    """
    bucket, base_path = get_s3_storage_bucket(region=region)

    prediction_key = f"{user_id}/prediction/{prediction_id}"
    # Only prepend the base path when the storage bucket actually has one
    if base_path:
        prediction_key = f"{base_path}/{prediction_key}"

    return bucket, prediction_key

In [None]:
# The prediction path must be "<user_id>/prediction/<prediction_id>", prefixed with
# the bucket's base path when there is one
actual = create_s3_prediction_path(user_id=999, prediction_id=999, region="eu-west-1")
display(actual)
expected = "999/prediction/999"
_, base_path = get_s3_storage_bucket(region="eu-west-1")
expected = f"{base_path}/{expected}" if base_path else expected

assert actual[0]
assert actual[1] == expected

(s3.Bucket(name='kumaran-airt-service-eu-west-1'), '999/prediction/999')

In [None]:
#| export


def get_batch_environment_arns(
    region: str, batch_environment_arn_path: Optional[Union[str, Path]] = None
) -> Dict[str, Dict[str, str]]:
    """Load the batch environment arn yaml file and return the entry of one region

    Args:
        region: Region whose batch environment arns are wanted
        batch_environment_arn_path: Location of the arn yaml file. When omitted,
            ./batch_environment.yml relative to the current working directory is read

    Returns:
        The value stored under the region key of the yaml file
    """
    arn_file = (
        Path("./batch_environment.yml")
        if batch_environment_arn_path is None
        else batch_environment_arn_path
    )
    with open(arn_file) as yaml_stream:
        arns_by_region = yaml.safe_load(yaml_stream)

    return arns_by_region[region]

In [None]:
# Build a placeholder arn mapping of the shape region -> task -> arn name -> arn;
# it is reused by the get_queue_definition_arns test in a later cell
region = "eu-west-1"
test_batch_environment_arns = {
    region: {
        task: {
            arn: "arn:aws:batch:placeholder"
            for arn in [
                "compute_environment_arn",
                "job_definition_arn",
                "job_queue_arn",
            ]
        }
        for task in ["csv_processing", "predictions", "preprocessing", "training"]
    }
}

# Write the mapping to a temporary yaml file and check it is read back unchanged
with tempfile.TemporaryDirectory() as td:
    td = Path(td)
    test_batch_environment_arn_path = td / "batch_environment.yml"
    with open(test_batch_environment_arn_path, "w") as f:
        yaml.dump(test_batch_environment_arns, f, default_flow_style=False)
    actual = get_batch_environment_arns(
        region=region, batch_environment_arn_path=test_batch_environment_arn_path
    )
    display(actual)
    assert actual == test_batch_environment_arns[region]

{'csv_processing': {'compute_environment_arn': 'arn:aws:batch:placeholder',
  'job_definition_arn': 'arn:aws:batch:placeholder',
  'job_queue_arn': 'arn:aws:batch:placeholder'},
 'predictions': {'compute_environment_arn': 'arn:aws:batch:placeholder',
  'job_definition_arn': 'arn:aws:batch:placeholder',
  'job_queue_arn': 'arn:aws:batch:placeholder'},
 'preprocessing': {'compute_environment_arn': 'arn:aws:batch:placeholder',
  'job_definition_arn': 'arn:aws:batch:placeholder',
  'job_queue_arn': 'arn:aws:batch:placeholder'},
 'training': {'compute_environment_arn': 'arn:aws:batch:placeholder',
  'job_definition_arn': 'arn:aws:batch:placeholder',
  'job_queue_arn': 'arn:aws:batch:placeholder'}}

In [None]:
#| export


def get_queue_definition_arns(
    task: str,
    region: str,
    batch_environment_arn_path: Optional[Union[str, Path]] = None,
) -> Tuple[str, str]:
    """Look up the job queue arn and the job definition arn of a task

    Args:
        task: Task name
        region: Region to get queue definition arns
        batch_environment_arn_path: Path to the arn file. If not set, then the batch_environment
            will be loaded from the current working directory

    Returns:
        The job queue arn and the job definition arn as a tuple
    """
    region_arns = get_batch_environment_arns(
        region=region, batch_environment_arn_path=batch_environment_arn_path
    )
    task_arns = region_arns[task]
    return task_arns["job_queue_arn"], task_arns["job_definition_arn"]

In [None]:
# Relies on test_batch_environment_arns defined in the get_batch_environment_arns test cell
region = "eu-west-1"
with tempfile.TemporaryDirectory() as td:
    td = Path(td)
    test_batch_environment_arn_path = td / "batch_environment.yml"
    with open(test_batch_environment_arn_path, "w") as f:
        yaml.dump(test_batch_environment_arns, f, default_flow_style=False)

    # Both arns returned for the task must match what was written to the yaml file
    task = "csv_processing"
    actual_job_queue_arn, actual_job_definition_arn = get_queue_definition_arns(
        task=task,
        region=region,
        batch_environment_arn_path=test_batch_environment_arn_path,
    )
    assert (
        actual_job_queue_arn
        == test_batch_environment_arns[region][task]["job_queue_arn"]
    )
    assert (
        actual_job_definition_arn
        == test_batch_environment_arns[region][task]["job_definition_arn"]
    )

In [None]:
#| export


def upload_to_s3_with_retry(
    file_to_upload: Union[str, Path],
    presigned_url: str,
    presigned_fields: Dict[str, Any],
    max_retry: int = 3,
    curr_iteration: int = 1,
) -> None:
    """
    Helper function to upload local files to s3 using presigned url; Used only in tests

    The upload is attempted again whenever the POST fails with a connection error,
    until the attempt counter reaches max_retry. Any other failure is not retried.

    Args:
        file_to_upload: path of file to upload
        presigned_url: presigned url to upload to
        presigned_fields: presigned fields provided by boto3
        max_retry: maximum retry count
        curr_iteration: current iteration count for internal use

    Raises:
        requests.exceptions.ConnectionError: If the last allowed attempt fails with a
            connection error
        AssertionError: If the response status code is not 204
    """
    attempt = curr_iteration
    while True:
        try:
            # Reopen the file on every attempt so each POST streams it from the start
            with open(file_to_upload, "rb") as f:
                files = {"file": (str(file_to_upload), f)}
                response = requests.post(
                    presigned_url, data=presigned_fields, files=files
                )
            assert response.status_code == 204, response.text  # nosec B101
            return
        except requests.exceptions.ConnectionError:
            # ">=" rather than "==" so that a max_retry smaller than the starting
            # attempt number stops immediately instead of retrying without bound
            if attempt >= max_retry:
                print("Retry failed")
                raise
            print("Retrying upload")
            attempt += 1

In [None]:
# Create a csv datasource and upload multiple csv files using presigned url
with get_session_with_context() as session:
    # Look up the test user created at the top of the notebook and request a
    # presigned upload target for it
    user = session.exec(select(User).where(User.username == test_username)).one()
    from_local_request = FromLocalRequest(
        path="tmp/test-folder/", tag="my_csv_datasource_tag"
    )
    from_local_response = from_local_start_route(
        from_local_request=from_local_request,
        user=user,
        session=session,
    )

    # Pull the parquet test dataset from s3 into a local cache directory
    with RemotePath.from_url(
        remote_url=f"s3://test-airt-service/account_312571_events",
        pull_on_enter=True,
        push_on_exit=False,
        exist_ok=True,
        parents=False,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ) as test_s3_path:
        # Convert the parquet data to csv files inside the cache directory
        ddf = dd.read_parquet(test_s3_path.as_path())
        display(ddf.head())
        ddf.to_csv(test_s3_path.as_path() / "csv" / "file-*.csv", index=False)
        display(list((test_s3_path.as_path() / "csv").glob("*")))
        !head -n 10 {test_s3_path.as_path()/"csv"/"file-0.csv"}
        # NOTE(review): the reason for this fixed 10 second pause is not evident from
        # this cell - confirm whether it is still needed
        sleep(10)

        # Upload every generated csv file through the presigned url, in name order
        for csv_to_upload in sorted((test_s3_path.as_path() / "csv").glob("*.csv")):
            display(f"Uploading {csv_to_upload}")
            upload_to_s3_with_retry(
                csv_to_upload,
                from_local_response.presigned["url"],
                from_local_response.presigned["fields"],
            )

  results = super().execute(


[INFO] airt_service.data.datablob: DataBlob.from_local(): FromLocalResponse(uuid=UUID('b8769687-9aa3-4186-85fa-28a930f1eba7'), type='local', presigned={'url': 'https://kumaran-airt-service-eu-west-1.s3.amazonaws.com/', 'fields': {'key': '9/datablob/8/${filename}', 'AWSAccessKeyId': 'AKIAY7RRHQ4BEOUZVSE3', 'policy': 'eyJleHBpcmF0aW9uIjogIjIwMjItMDktMDNUMDY6Mzk6NDBaIiwgImNvbmRpdGlvbnMiOiBbWyJzdGFydHMtd2l0aCIsICIka2V5IiwgIjkvZGF0YWJsb2IvOCJdLCB7ImJ1Y2tldCI6ICJrdW1hcmFuLWFpcnQtc2VydmljZS1ldS13ZXN0LTEifSwgWyJzdGFydHMtd2l0aCIsICIka2V5IiwgIjkvZGF0YWJsb2IvOC8iXV19', 'signature': '2kLPQLr0YsHsBBhzGtd8nD6EfVw='}})
[INFO] airt.remote_path: RemotePath.from_url(): creating remote path with the following url s3://test-airt-service/account_312571_events
[INFO] airt.remote_path: S3Path._create_cache_path(): created cache path: /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x
[INFO] airt.remote_path: S3Path.__init__(): created object for accessing s3://test-airt-service/account_312571_even

Unnamed: 0,AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
0,312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
1,312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
2,312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
3,312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
4,312571,loadTests3,2020-01-13 07:03:22,1578908002678,2


[Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-4.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-3.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-2.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-0.csv'),
 Path('/tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-1.csv')]

AccountId,DefinitionId,OccurredTime,OccurredTimeTicks,PersonId
312571,loadTests2,2019-12-31 21:30:02,1577836802678,2
312571,loadTests3,2020-01-03 23:53:22,1578104602678,2
312571,loadTests1,2020-01-07 02:16:42,1578372402678,2
312571,loadTests2,2020-01-10 04:40:02,1578640202678,2
312571,loadTests3,2020-01-13 07:03:22,1578908002678,2
312571,loadTests1,2020-01-16 09:26:42,1579175802678,2
312571,loadTests2,2020-01-19 11:50:02,1579443602678,2
312571,loadTests3,2020-01-22 14:13:22,1579711402678,2
312571,loadTests1,2020-01-25 16:36:42,1579979202678,2


'Uploading /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-0.csv'

'Uploading /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-1.csv'

'Uploading /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-2.csv'

'Uploading /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-3.csv'

'Uploading /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x/csv/file-4.csv'

[INFO] airt.remote_path: S3Path._clean_up(): removing local cache path /tmp/s3test-airt-serviceaccount_312571_events_cached_mkwk1c3x


In [None]:
#| export


def get_s3_bucket_and_path_from_uri(uri: Union[str, Path]) -> Tuple[Bucket, str]:
    """Split an s3 uri into its bucket object and the path inside the bucket

    Args:
        uri: full s3 uri

    Returns:
        The bucket object and the s3 path as a tuple
    """
    s3_resource = boto3.resource("s3")
    # The uri is converted to str first, so Path inputs are accepted as well
    name, path_in_bucket = get_s3_bucket_name_and_folder_from_uri(str(uri))
    return s3_resource.Bucket(name), path_in_bucket

In [None]:
# The uri must be split into the bucket name and the remaining path
actual = get_s3_bucket_and_path_from_uri(
    uri="s3://test-airt-service/account_312571_events"
)
assert actual[0]
assert actual[0].name == "test-airt-service"
assert actual[1] == "account_312571_events"