In [None]:
#| default_exp azure.utils

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| export

import logging
import os
import yaml
from pathlib import Path
from typing import *

from azure.identity import DefaultAzureCredential
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.storage import StorageManagementClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob._container_client import ContainerClient
from fastapi import status, HTTPException

from airt.helpers import get_s3_bucket_name_and_folder_from_uri
from airt.logger import get_logger

In [None]:
import tempfile

import pytest

from airt_service.db.models import create_user_for_testing

In [None]:
# Create a user through the testing helper and show the value it returns
# (a username string, judging by the output below).
test_username = create_user_for_testing()
display(test_username)

'nyupzlpfre'

In [None]:
#| exporti

# Module-level logger, named after this module
logger = get_logger(__name__)

In [None]:
#| exporti

# The azure-storage-blob library logs excessively through its HTTP logging policy
# logger; raise that logger's level to WARNING to keep the output readable.

logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
    logging.WARNING
)

In [None]:
#| export


def get_available_azure_regions() -> List[str]:
    """Return the names of the azure regions supported by the service

    The names are hardcoded, not queried from azure.

    Returns:
        A new list with the supported azure region names
    """

    # Hardcoded list from https://stackoverflow.com/a/61263190/3664629
    # ToDo: retrieve programmatically and replace
    # fmt: off
    supported_regions = (
        "australiacentral", "australiacentral2", "australiaeast", "australiasoutheast",
        "brazilsouth",
        "canadacentral", "canadaeast", "centralindia", "centralus",
        "eastasia", "eastus", "eastus2",
        "francecentral", "francesouth",
        "germanynorth", "germanywestcentral",
        "japaneast", "japanwest",
        "koreacentral", "koreasouth",
        "northcentralus", "northeurope", "norwayeast", "norwaywest",
        "southafricanorth", "southafricawest", "southcentralus", "southeastasia",
        "southindia", "switzerlandnorth", "switzerlandwest",
        "uaecentral", "uaenorth", "uksouth", "ukwest",
        "westcentralus", "westeurope", "westindia", "westus", "westus2",
    )
    # fmt: on

    # Hand out a fresh list so callers can mutate their copy freely
    return list(supported_regions)

In [None]:
# Show the full list of supported region names
get_available_azure_regions()

['australiacentral',
 'australiacentral2',
 'australiaeast',
 'australiasoutheast',
 'brazilsouth',
 'canadacentral',
 'canadaeast',
 'centralindia',
 'centralus',
 'eastasia',
 'eastus',
 'eastus2',
 'francecentral',
 'francesouth',
 'germanynorth',
 'germanywestcentral',
 'japaneast',
 'japanwest',
 'koreacentral',
 'koreasouth',
 'northcentralus',
 'northeurope',
 'norwayeast',
 'norwaywest',
 'southafricanorth',
 'southafricawest',
 'southcentralus',
 'southeastasia',
 'southindia',
 'switzerlandnorth',
 'switzerlandwest',
 'uaecentral',
 'uaenorth',
 'uksouth',
 'ukwest',
 'westcentralus',
 'westeurope',
 'westindia',
 'westus',
 'westus2']

In [None]:
#| export


def verify_azure_region(region: str):
    """
    Check that the given region is one of the supported azure regions

    Args:
        region: region name
    Raises:
        HTTPException: With status code 400 if region is not one of the names returned
            by get_available_azure_regions
    """
    supported_regions = get_available_azure_regions()
    if region in supported_regions:
        return

    supported_regions_text = ", ".join(supported_regions)
    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=f"Unknown region - {region}; Available regions are {supported_regions_text}",
    )

In [None]:
# A supported region passes silently
verify_azure_region("westeurope")

# An unsupported region must be rejected with an HTTPException
with pytest.raises(HTTPException) as e:
    verify_azure_region(region="region-doesnt-exists")
# str(e) is the text of the pytest ExceptionInfo, which includes the exception detail
assert "Unknown region" in str(e)
display(e)

<ExceptionInfo HTTPException(status_code=400, detail='Unknown region - region-doesnt-exists; Available regions are australiacentral, ...dnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2') tblen=2>

In [None]:
#| export


def create_azure_resource_group_storage_account_and_container(
    resource_group_region: str = "westeurope",
    *,
    storage_account_region: str,
) -> str:
    """
    Create azure resource group, storage account and blob container

    The subscription id, the resource group name and the storage account name prefix
    are read from the AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP and
    AZURE_STORAGE_ACCOUNT_PREFIX environment variables. The storage account name is the
    prefix followed by the storage account region, cut down to its last 24 characters.
    The blob container gets the same name as the storage account.

    Args:
        resource_group_region: region of resource group
        storage_account_region: region of storage account
    Returns:
        Created storage account's name
    Raises:
        KeyError: If one of the required environment variables is not set
    """
    credential = DefaultAzureCredential()
    subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]
    resource_group = os.environ["AZURE_RESOURCE_GROUP"]

    resource_client = ResourceManagementClient(credential, subscription_id)
    resource_client.resource_groups.create_or_update(
        resource_group, {"location": resource_group_region}
    )

    storage_client = StorageManagementClient(credential, subscription_id)
    # Only the last 24 characters are kept, presumably to satisfy azure's length limit
    # on storage account names
    storage_account_name = (
        f"{os.environ['AZURE_STORAGE_ACCOUNT_PREFIX']}{storage_account_region}"[-24:]
    )
    availability_result = storage_client.storage_accounts.check_name_availability(
        {"name": storage_account_name}
    )

    # The account is created only while its name is still free.
    # NOTE(review): a taken name is assumed to mean "already created by an earlier
    # call"; a name owned by somebody else would also end up here - confirm.
    if availability_result.name_available:
        # Long-running operations return a poller object; calling result() on it
        # waits for completion.
        storage_client.storage_accounts.begin_create(
            resource_group,
            storage_account_name,
            {
                "location": storage_account_region,
                "kind": "StorageV2",
                "sku": {"name": "Standard_LRS"},
            },
        ).result()

    # Container name is same as storage account name
    storage_client.blob_containers.create(
        resource_group, storage_account_name, storage_account_name, {}
    )
    return storage_account_name

In [None]:
# Runs against the real azure subscription configured through environment variables;
# only checks that a non-empty storage account name comes back.
actual = create_azure_resource_group_storage_account_and_container(
    storage_account_region="westeurope"
)
display(actual)
assert actual

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential


'kumsairtsdevwesteurope'

In [None]:
#| export


def get_azure_blob_storage_container(
    region: str = "westeurope",
) -> Tuple[ContainerClient, str]:
    """Get the root azure blob storage container to store datasources, models, predictions

    The storage account and the container for the region are created first if needed.

    Args:
        region: region name
    Returns:
        The container client of the root storage container and the base path inside
        the container as a tuple
    Raises:
        HTTPException: If region is not a valid region
    """
    verify_azure_region(region)

    account_name = create_azure_resource_group_storage_account_and_container(
        storage_account_region=region,
    )

    # The container carries the same name as its storage account
    container_url = f"https://{account_name}.blob.core.windows.net/{account_name}"
    storage_account, path_in_account = get_s3_bucket_name_and_folder_from_uri(
        container_url
    )

    # First path segment is the container name, the rest (possibly empty) is the base path
    container_name, _, base_path = path_in_account.partition("/")

    service_client = BlobServiceClient(
        account_url=f"https://{storage_account}",
        credential=DefaultAzureCredential(),
    )
    return service_client.get_container_client(container=container_name), base_path

In [None]:
# Runs against the real azure subscription; shows the returned tuple and the
# container url for a visual check.
actual = get_azure_blob_storage_container(region="westeurope")
display(actual)
display(actual[0].url)
# NOTE(review): a non-empty tuple is always truthy, so this assertion cannot fail
assert actual

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


(<azure.storage.blob._container_client.ContainerClient at 0x7f6cfc425d30>, '')

'https://kumsairtsdevwesteurope.blob.core.windows.net/kumsairtsdevwesteurope'

In [None]:
# An unsupported region must be rejected with an HTTPException
with pytest.raises(HTTPException) as e:
    get_azure_blob_storage_container(region="region-doesnt-exists")
# str(e) is the text of the pytest ExceptionInfo, which includes the exception detail
assert "Unknown region" in str(e)
display(e)

<ExceptionInfo HTTPException(status_code=400, detail='Unknown region - region-doesnt-exists; Available regions are australiacentral, ...dnorth, switzerlandwest, uaecentral, uaenorth, uksouth, ukwest, westcentralus, westeurope, westindia, westus, westus2') tblen=3>

In [None]:
#| export


def create_azure_blob_storage_datablob_path(
    user_id: int, datablob_id: int, region: str
) -> Tuple[ContainerClient, str]:
    """Create an azure blob storage path to store the datablobs

    Args:
        user_id: User id
        datablob_id: Datablob id
        region: Region name of the storage container

    Returns:
        The root container client object and the azure blob storage path as a tuple
    """
    container_client, base_path = get_azure_blob_storage_container(region=region)

    # Prepend the base path only when the container has one
    prefix = f"{base_path}/" if base_path else ""
    return container_client, f"{prefix}{user_id}/datablob/{datablob_id}"

In [None]:
# Runs against the real azure subscription
actual = create_azure_blob_storage_datablob_path(
    user_id=999, datablob_id=999, region="westeurope"
)
display(actual)

# Build the expected path the same way the function does: prefix with the
# container's base path when there is one
expected = "999/datablob/999"
_, base_path = get_azure_blob_storage_container(region="westeurope")
expected = f"{base_path}/{expected}" if base_path else expected

assert actual[0]
assert actual[1] == expected

# NOTE(review): the commented-out check below appears to be a leftover from an S3
# version of this test (it also uses a datasource key); it is not valid for a
# ContainerClient - remove or port it.
# bucket = actual[0]
# display(bucket.meta.client.head_object(Bucket=bucket.name, Key="999/datasource/999/result.json"))

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


(<azure.storage.blob._container_client.ContainerClient at 0x7f6cfc432be0>,
 '999/datablob/999')

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


In [None]:
#| export


def create_azure_blob_storage_datasource_path(
    user_id: int, datasource_id: int, region: str
) -> Tuple[ContainerClient, str]:
    """Create an azure blob storage path to store the datasources

    Args:
        user_id: User id
        datasource_id: Datasource id to store
        region: Region name of the storage container

    Returns:
        The root container client object and the azure blob storage path as a tuple
    """
    container_client, base_path = get_azure_blob_storage_container(region=region)

    # Prepend the base path only when the container has one
    prefix = f"{base_path}/" if base_path else ""
    return container_client, f"{prefix}{user_id}/datasource/{datasource_id}"

In [None]:
# Runs against the real azure subscription
actual = create_azure_blob_storage_datasource_path(
    user_id=999, datasource_id=999, region="westeurope"
)
display(actual)

# Build the expected path the same way the function does: prefix with the
# container's base path when there is one
expected = "999/datasource/999"
_, base_path = get_azure_blob_storage_container(region="westeurope")
expected = f"{base_path}/{expected}" if base_path else expected


assert actual[0]
assert actual[1] == expected

# NOTE(review): the commented-out check below appears to be a leftover from an S3
# version of this test; it is not valid for a ContainerClient - remove or port it.
# bucket = actual[0]
# display(bucket.meta.client.head_object(Bucket=bucket.name, Key="999/datasource/999/result.json"))

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


(<azure.storage.blob._container_client.ContainerClient at 0x7f6cfc38f520>,
 '999/datasource/999')

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


In [None]:
#| export


def create_azure_blob_storage_prediction_path(
    user_id: int, prediction_id: int, region: str
) -> Tuple[ContainerClient, str]:
    """Create an azure blob storage path to store the prediction results

    Args:
        user_id: User id
        prediction_id: Prediction id
        region: Region name of the storage container

    Returns:
        The root container client object and the azure blob storage path as a tuple
    """
    container_client, base_path = get_azure_blob_storage_container(region=region)

    # Prepend the base path only when the container has one
    prefix = f"{base_path}/" if base_path else ""
    return container_client, f"{prefix}{user_id}/prediction/{prediction_id}"

In [None]:
# Runs against the real azure subscription
actual = create_azure_blob_storage_prediction_path(
    user_id=999, prediction_id=999, region="westeurope"
)
display(actual)

# Build the expected path the same way the function does: prefix with the
# container's base path when there is one
expected = "999/prediction/999"
_, base_path = get_azure_blob_storage_container(region="westeurope")
expected = f"{base_path}/{expected}" if base_path else expected

assert actual[0]
assert actual[1] == expected

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


(<azure.storage.blob._container_client.ContainerClient at 0x7f6cfc425370>,
 '999/prediction/999')

[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS
[INFO] azure.identity._credentials.chained: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.default: DefaultAzureCredential acquired a token from EnvironmentCredential
[INFO] azure.identity._credentials.environment: Environment is configured for ClientSecretCredential
[INFO] azure.identity._credentials.managed_identity: ManagedIdentityCredential will use IMDS


In [None]:
#| export


def get_azure_batch_environment_component_names(
    region: str, batch_environment_path: Optional[Union[str, Path]] = None
) -> Dict[str, Dict[str, str]]:
    """Read the batch environment yaml file and return the names for a region as a dict

    Args:
        region: Region to get batch environment names. If the yaml file has no entry for this region,
            the entry of the "northeurope" region is returned instead
        batch_environment_path: Path to the yaml file with azure batch environment names. If not set, then the batch_environment
            will be loaded from the current working directory

    Returns:
        The created batch environment names as a dict
    Raises:
        KeyError: If the yaml file has neither an entry for the given region nor for "northeurope"
    """
    if batch_environment_path is None:
        batch_environment_path = Path("./azure_batch_environment.yml")
    with open(batch_environment_path) as f:
        batch_environment_names = yaml.safe_load(f)

    # Honour the requested region whenever the file describes it
    if region in batch_environment_names:
        return batch_environment_names[region]

    # ToDo: For now we have azure batch environment only for northeurope region. Fix this once we have more regions
    return batch_environment_names["northeurope"]

In [None]:
region = "northeurope"
# Fake environment description: region -> task -> component key -> component name
# (the loop variable is called `arn` although the values are plain names)
test_batch_environment_names = {
    region: {
        task: {
            arn: "random_azure_batch_env_component_name"
            for arn in [
                "batch_job_name",
                "batch_pool_name",
                "batch_account_name",
            ]
        }
        for task in ["csv_processing", "predictions", "preprocessing", "training"]
    }
}

# Write the fake description to a temporary yaml file and read it back
with tempfile.TemporaryDirectory() as td:
    td = Path(td)
    test_batch_environment_path = td / "azure_batch_environment.yml"
    with open(test_batch_environment_path, "w") as f:
        yaml.dump(test_batch_environment_names, f, default_flow_style=False)
    actual = get_azure_batch_environment_component_names(
        region=region, batch_environment_path=test_batch_environment_path
    )
    display(actual)
    assert actual == test_batch_environment_names[region]

{'csv_processing': {'batch_account_name': 'random_azure_batch_env_component_name',
  'batch_job_name': 'random_azure_batch_env_component_name',
  'batch_pool_name': 'random_azure_batch_env_component_name'},
 'predictions': {'batch_account_name': 'random_azure_batch_env_component_name',
  'batch_job_name': 'random_azure_batch_env_component_name',
  'batch_pool_name': 'random_azure_batch_env_component_name'},
 'preprocessing': {'batch_account_name': 'random_azure_batch_env_component_name',
  'batch_job_name': 'random_azure_batch_env_component_name',
  'batch_pool_name': 'random_azure_batch_env_component_name'},
 'training': {'batch_account_name': 'random_azure_batch_env_component_name',
  'batch_job_name': 'random_azure_batch_env_component_name',
  'batch_pool_name': 'random_azure_batch_env_component_name'}}

In [None]:
#| export


def get_batch_account_pool_job_names(
    task: str,
    region: str,
    batch_environment_path: Optional[Union[str, Path]] = None,
) -> Tuple[str, str, str]:
    """Get the batch account, batch pool and batch job names for the given task

    Args:
        task: Task name
        region: Region to get component names
        batch_environment_path: Path to the yaml file with azure batch environment names. If not set, then the batch_environment
            will be loaded from the current working directory
    Returns:
        A tuple which consists of batch account name, batch pool name, batch job name for the given task and region
    """
    component_names = get_azure_batch_environment_component_names(
        region=region, batch_environment_path=batch_environment_path
    )
    task_component_names = component_names[task]

    return (
        task_component_names["batch_account_name"],
        task_component_names["batch_pool_name"],
        task_component_names["batch_job_name"],
    )

In [None]:
# Reuses test_batch_environment_names defined in the test cell of
# get_azure_batch_environment_component_names
region = "northeurope"
with tempfile.TemporaryDirectory() as td:
    td = Path(td)

    # Write the fake description to a temporary yaml file
    test_batch_environment_path = td / "azure_batch_environment.yml"
    with open(test_batch_environment_path, "w") as f:
        yaml.dump(test_batch_environment_names, f, default_flow_style=False)

    task = "csv_processing"
    (
        actual_batch_account_name,
        actual_batch_pool_name,
        actual_batch_job_name,
    ) = get_batch_account_pool_job_names(
        task=task,
        region=region,
        batch_environment_path=test_batch_environment_path,
    )

    # Each returned name must match the entry written to the yaml file
    assert (
        actual_batch_account_name
        == test_batch_environment_names[region][task]["batch_account_name"]
    )
    assert (
        actual_batch_pool_name
        == test_batch_environment_names[region][task]["batch_pool_name"]
    )
    assert (
        actual_batch_job_name
        == test_batch_environment_names[region][task]["batch_job_name"]
    )