In [None]:
#| default_exp aws.batch_utils

In [None]:
from airt.testing import activate_by_import

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] numexpr.utils: NumExpr defaulting to 8 threads.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| exporti

import random
import shlex
import string
import yaml
from contextlib import contextmanager, ContextDecorator
from os import environ
from pathlib import Path
from time import sleep
from typing import *

import boto3
from fastcore.script import call_parse, Param
from fastcore.utils import patch

from airt_service.sanitizer import sanitized_print
from airt.helpers import ensure
from airt.logger import get_logger
from airt_service.aws.utils import get_available_aws_regions

In [None]:
import tempfile

In [None]:
#| exporti

logger = get_logger(__name__)

In [None]:
#| export


def get_random_string(length: int = 6) -> str:
    """Build a random string of uppercase ASCII letters and digits

    Args:
        length: Number of characters to generate

    Returns:
        A string of ``length`` characters, each picked independently from ``A-Z`` and ``0-9``
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = []
    for _ in range(length):
        # bandit's B311 warning (non-cryptographic RNG) is deliberately suppressed here
        picked.append(random.choice(alphabet))  # nosec B311
    return "".join(picked)

In [None]:
# Smoke test: asking for 10 characters must give back a str of exactly that length
actual = get_random_string(length=10)
display(actual)
assert isinstance(actual, str)
assert len(actual) == 10

'7UREUJL9U2'

In [None]:
#| export


def get_instance_info(instance_type: str, region: str) -> Tuple[int, int, int]:
    """Get the instance VCPU count, memory, GPU count for the given instance_type

    Args:
        instance_type: Instance type as a string
        region: AWS region whose EC2 endpoint is queried

    Returns:
        The VCPU count, memory (in MiB), and GPU count as a tuple

    Raises:
        ValueError: If EC2 returns no description for the instance type
    """
    client = boto3.client("ec2", region_name=region)
    response = client.describe_instance_types(
        InstanceTypes=[instance_type],
    )
    if len(response["InstanceTypes"]) == 0:
        raise ValueError(f"{len(response['InstanceTypes'])=}")
    instance_details = response["InstanceTypes"][0]

    vcpus = instance_details["VCpuInfo"]["DefaultVCpus"]
    memory = instance_details["MemoryInfo"]["SizeInMiB"]

    # "Gpus" holds one entry per GPU device *type*, each with its own "Count", so the number
    # of GPUs is the sum of the counts and not the number of entries.
    # An entry without "Count" is counted as one device, which matches the previous result
    # for such an entry (NOTE(review): confirm EC2 always sends "Count").
    gpu_devices = instance_details.get("GpuInfo", {}).get("Gpus", [])
    gpu = sum(device.get("Count", 1) for device in gpu_devices)
    return vcpus, memory, gpu

In [None]:
# Queries EC2 in eu-west-1: g4dn.xlarge is expected to report
# (VCPU count, memory in MiB, GPU count) == (4, 16384, 1)
expected = (4, 16384, 1)
actual = get_instance_info(instance_type="g4dn.xlarge", region="eu-west-1")
display(actual)
assert actual == expected

[INFO] botocore.credentials: Found credentials in environment variables.


(4, 16384, 1)

In [None]:
#| export

def get_availability_zones_and_subnets_in_region(region: str) -> Dict[str, str]:
    """Map every availability zone of a region to one of its subnet ids

    Args:
        region: region whose subnets are listed

    Returns:
        A dict with availability zones as keys and subnet ids as values; when several
        subnets are listed for the same zone, the one listed last is kept
    """
    ec2 = boto3.client("ec2", region_name=region)
    return {
        entry["AvailabilityZone"]: entry["SubnetId"]
        for entry in ec2.describe_subnets()["Subnets"]
    }

In [None]:
# Queries EC2 in eu-west-1 and compares the returned subnet ids (the dict values) with the
# hard-coded ids below, ignoring order
expected = ["subnet-aa2974e2", "subnet-19d65943", "subnet-f296d494"]
actual = get_availability_zones_and_subnets_in_region(region="eu-west-1")
display(actual)

assert sorted(expected) == sorted(actual.values())

{'eu-west-1c': 'subnet-aa2974e2',
 'eu-west-1a': 'subnet-19d65943',
 'eu-west-1b': 'subnet-f296d494'}

In [None]:
#| export


def get_subnets_with_instance_availability(
    region: str, instance_types: Optional[List[str]] = None
) -> List[str]:
    """Get subnets where given instance type are available for offering

    Args:
        region: region to get subnets
        instance_types: list of instance types; defaults to ["g4dn.xlarge", "r5.16xlarge"]
            when None. An empty list places no restriction, so every known subnet is returned

    Returns:
        A list of subnets, ordered by availability zone name, lying in zones where all of the
        given instance types are offered. Zones that offer the instance types but have no
        subnet in the zone-to-subnet mapping are skipped
    """
    if instance_types is None:
        instance_types = ["g4dn.xlarge", "r5.16xlarge"]
    zones_and_subnets = get_availability_zones_and_subnets_in_region(region)

    # No instance types means no constraint (set.intersection() of nothing would raise)
    if not instance_types:
        return [zones_and_subnets[zone] for zone in sorted(zones_and_subnets)]

    client = boto3.client("ec2", region_name=region)

    common_availability_zones = None
    for instance_type in instance_types:
        offerings = client.describe_instance_type_offerings(
            LocationType="availability-zone",
            Filters=[{"Name": "instance-type", "Values": [instance_type]}],
        )
        availability_zones = {
            instance_details["Location"]
            for instance_details in offerings["InstanceTypeOfferings"]
        }
        if common_availability_zones is None:
            common_availability_zones = availability_zones
        else:
            common_availability_zones = common_availability_zones & availability_zones

    # Sorted for a reproducible result; a zone can offer the instance types without
    # appearing in the subnet mapping, so such zones are filtered out instead of
    # failing with KeyError
    return [
        zones_and_subnets[availability_zone]
        for availability_zone in sorted(common_availability_zones)
        if availability_zone in zones_and_subnets
    ]

In [None]:
# Uses the function's default instance types in eu-west-1 and compares the returned subnet
# ids with the hard-coded ids below, ignoring order
expected = ['subnet-f296d494', 'subnet-19d65943', 'subnet-aa2974e2']
actual = get_subnets_with_instance_availability("eu-west-1")
display(actual)
assert sorted(actual) == sorted(expected)

['subnet-f296d494', 'subnet-19d65943', 'subnet-aa2974e2']

In [None]:
#| export

def get_default_security_group_id(region: str) -> str:
    """Look up the id of the security group named "default" in the given region

    Args:
        region: region to get default security group id

    Returns:
        The group id of the first security group returned for the name "default"
    """
    ec2 = boto3.client("ec2", region_name=region)
    matches = ec2.describe_security_groups(GroupNames=["default"])["SecurityGroups"]
    default_group = matches[0]
    return default_group["GroupId"]

In [None]:
# Queries EC2 in eu-west-1 and compares the default security group id with the hard-coded one
expected = 'sg-5d3ee12b'
actual = get_default_security_group_id("eu-west-1")
display(actual)
assert actual == expected

'sg-5d3ee12b'

In [None]:
#| export


class ComputeEnvironment(ContextDecorator):
    """A class for creating and managing the compute environment

    An instance wraps a response dict describing an AWS Batch compute environment together
    with the region it was looked up or created in. When used as a context manager, leaving
    the block disables the compute environment, waits until that took effect and deletes it.
    """

    def __init__(self, response: Dict[str, Any], region: str):
        """Constructs a new ComputeEnvironment instance

        Args:
            response: The compute environment describe response
            region: AWS region of the compute environment
        """
        self.response = response
        self.region = region

    @property
    def arn(self) -> str:
        """Get ARN of the compute environment

        Returns:
            The ARN of the compute environment
        """
        return self.response["computeEnvironmentArn"]

    @property
    def name(self) -> str:
        """Get name of the compute environment

        Returns:
            The name of the compute environment
        """
        return self.response["computeEnvironmentName"]

    @property
    def instance_type(self) -> str:
        """Get instance type of the compute environment

        Returns:
            The first instance type listed in the compute resources of the response
        """
        return self.response["computeResources"]["instanceTypes"][0]

    @classmethod
    def from_name_or_arn(cls, name: str, region: str) -> "ComputeEnvironment":
        """Construct ComputeEnvironment object from name or from ARN

        Args:
            name: name or ARN of the compute environment
            region: AWS region of the compute environment

        Returns:
            The ComputeEnvironment object (an instance of the class the method is called on)

        Raises:
            ValueError: If the lookup does not return exactly one compute environment
        """
        client = boto3.client("batch", region_name=region)

        response = client.describe_compute_environments(
            computeEnvironments=[
                name,
            ],
        )

        if len(response["computeEnvironments"]) != 1:
            raise ValueError(f"{len(response['computeEnvironments'])=}")

        return cls(response["computeEnvironments"][0], region)

    @classmethod
    def create(
        cls,
        *,
        name: Optional[str] = None,
        region: str,
        resource_type: str = "EC2",  # EC2 or SPOT
        allocation_strategey: str = "BEST_FIT",
        min_instances: int = 0,
        max_instances: int = 3,
        instance_type: str,
        subnets: Optional[List[str]] = None,
        security_group_ids: Optional[List[str]] = None,
        ec2_key_pair: Optional[str] = None,
        instance_role: str = "arn:aws:iam::617504802562:instance-profile/ecsInstanceRole",
        launch_template_name: str = "gitlab-registry-access-template",
    ) -> "ComputeEnvironment":
        """Create a new compute environment

        If a compute environment with the given name already exists in the region, it is
        returned as is and nothing is created.

        Args:
            name: Name of the compute environment; a random name is generated when None
            region: AWS region to create the compute environment in
            resource_type: The type of the resource. Use 'EC2' for on-demand instances or 'SPOT' for spot instances
            allocation_strategey: Allocation strategy of the resource. If not set, the default value **BEST_FIT** will be used
                Other Valid options are ('BEST_FIT'|'BEST_FIT_PROGRESSIVE'|'SPOT_CAPACITY_OPTIMIZED')
            min_instances: Minimum instances to keep running in compute environment's ecs cluster
            max_instances: Maximum instances to scale up in compute environment's ecs cluster
            instance_type: Instance type to use to execute jobs
            subnets: subnets to use; when None, the result of
                get_subnets_with_instance_availability(region) is used
            security_group_ids: Security groups to use; when None, the default security group
                of the region is used
            ec2_key_pair: EC2 key pair to use in spinned instances
            instance_role: ECS instance profile applied to Amazon EC2 instances in a compute environment
            launch_template_name: Launch template to use to spin up instances

        Returns:
            The newly created (or already existing) ComputeEnvironment object

        Raises:
            ValueError: If more than one compute environment matches the name
            RuntimeError: If the newly created compute environment ends up in INVALID status
        """
        client = boto3.client("batch", region_name=region)

        if name is None:
            name = f"compute-environment-{get_random_string()}"

        response = client.describe_compute_environments(
            computeEnvironments=[
                name,
            ],
        )

        if len(response["computeEnvironments"]) > 1:
            raise ValueError(f"{len(response['computeEnvironments'])=}")
        elif len(response["computeEnvironments"]) == 1:
            return cls(response=response["computeEnvironments"][0], region=region)

        # ToDo: Remove default values for following and have a separate config dict for g4dn instance, r instance, etc in eu-west-1
        if subnets is None:
            # NOTE(review): this uses the helper's default instance types rather than
            # `instance_type` - confirm that is intended
            subnets = get_subnets_with_instance_availability(region)
        if security_group_ids is None:
            security_group_ids = [get_default_security_group_id(region)]

        # Batch sizes the environment in VCPUs, so instance counts are converted
        vcpus, memory, gpu = get_instance_info(instance_type=instance_type, region=region)
        min_vcpus = min_instances * vcpus
        max_vcpus = max_instances * vcpus

        compute_resources = {
            "type": resource_type,
            "allocationStrategy": allocation_strategey,
            "minvCpus": min_vcpus,
            "maxvCpus": max_vcpus,
            "instanceTypes": [instance_type],
            "subnets": subnets,
            "securityGroupIds": security_group_ids,
            "instanceRole": instance_role,
            "launchTemplate": {
                "launchTemplateName": launch_template_name,
                "version": "$Default",
            },
        }
        if ec2_key_pair is not None:
            compute_resources["ec2KeyPair"] = ec2_key_pair

        response = client.create_compute_environment(
            computeEnvironmentName=name,
            type="MANAGED",
            state="ENABLED",
            computeResources=compute_resources,
        )
        # The create response only carries name and ARN; wait() returns the full description.
        # fail_on_invalid stops the polling if creation fails instead of looping forever.
        compute_env = cls(response=response, region=region)
        response = compute_env.wait(
            status="VALID", state="ENABLED", fail_on_invalid=True
        )
        return cls(response=response, region=region)

    def wait(
        self,
        status: str,
        state: str,
        timeout: int = 0,
        sleep_step: int = 1,
        fail_on_invalid: bool = False,
    ) -> Dict[str, Any]:
        """Wait until the compute environment reaches the given status and state

        Args:
            status: Status to wait for ('CREATING'|'UPDATING'|'DELETING'|'DELETED'|'VALID'|'INVALID')
            state: State to wait for ('ENABLED'|'DISABLED')
            timeout: The maximum time allowed in seconds for the command to complete. If greater than 0,
                then the waiting stops after the timeout and the last description is returned
            sleep_step: The time interval in seconds to check the completion status of the command
            fail_on_invalid: If True, raise as soon as the compute environment reports the
                status 'INVALID' without matching the requested status and state

        Returns:
            The response of describe compute environment

        Raises:
            RuntimeError: If fail_on_invalid is set and the status 'INVALID' is observed
        """
        client = boto3.client("batch", region_name=self.region)

        i = 0
        while True:
            if 0 < timeout <= i:
                logger.info(f"wait timedout after {i:,d} seconds for arn: '{self.arn}'")
                break
            response = client.describe_compute_environments(
                computeEnvironments=[
                    self.arn,
                ],
            )
            current = response["computeEnvironments"][0]
            logger.info(
                f'wait(): {self.arn=}, status={current["status"]}, state={current["state"]}'
            )
            if current["status"] == status and current["state"] == state:
                break
            if fail_on_invalid and current["status"] == "INVALID":
                raise RuntimeError(
                    f"Compute environment '{self.arn}' is INVALID: {current.get('statusReason')}"
                )
            sleep(sleep_step)
            i = i + sleep_step
        return response["computeEnvironments"][0]

    def update(self, *args, **kwargs):
        """Update compute environment

        The arguments are forwarded to the Batch client's update_compute_environment call;
        the ARN of this compute environment is supplied automatically.
        """
        client = boto3.client("batch", region_name=self.region)
        client.update_compute_environment(
            computeEnvironment=self.arn, *args, **kwargs
        )

    def delete(self):
        """Delete compute environment"""
        client = boto3.client("batch", region_name=self.region)
        client.delete_compute_environment(
            computeEnvironment=self.arn,
        )

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        # The environment is disabled, and the change awaited, before it is deleted
        self.update(state="DISABLED")
        self.wait(status="VALID", state="DISABLED")
        self.delete()
        # Never swallow an exception raised inside the with block
        return False

In [None]:
# Creates a compute environment with a generated name in eu-west-1 and checks that it comes
# up VALID and ENABLED; leaving the with block disables and deletes it again
with ComputeEnvironment.create(instance_type="g4dn.xlarge", region="eu-west-1") as compute_env:
    display(compute_env.response)
    assert compute_env.name
    assert compute_env.instance_type == "g4dn.xlarge"
    assert compute_env.response["status"] == "VALID"
    assert compute_env.response["state"] == "ENABLED"

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MEJY2C', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MEJY2C', status=VALID, state=ENABLED


{'computeEnvironmentName': 'compute-environment-MEJY2C',
 'computeEnvironmentArn': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MEJY2C',
 'ecsClusterArn': 'arn:aws:ecs:eu-west-1:617504802562:cluster/AWSBatch-compute-environment-MEJY2C-965157d3-a92b-3dc5-9e07-5fb2de0b3c0e',
 'tags': {},
 'type': 'MANAGED',
 'state': 'ENABLED',
 'status': 'VALID',
 'statusReason': 'ComputeEnvironment Healthy',
 'computeResources': {'type': 'EC2',
  'allocationStrategy': 'BEST_FIT',
  'minvCpus': 0,
  'maxvCpus': 12,
  'desiredvCpus': 0,
  'instanceTypes': ['g4dn.xlarge'],
  'subnets': ['subnet-f296d494', 'subnet-19d65943', 'subnet-aa2974e2'],
  'securityGroupIds': ['sg-5d3ee12b'],
  'instanceRole': 'arn:aws:iam::617504802562:instance-profile/ecsInstanceRole',
  'tags': {},
  'launchTemplate': {'launchTemplateName': 'gitlab-registry-access-template',
   'version': '$Default'},
  'ec2Configuration': [{'imageType': 'ECS_AL2_NVIDIA'}]},
 'serviceRole': 'arn:aws:iam::617504802

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MEJY2C', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MEJY2C', status=VALID, state=DISABLED


In [None]:
# The same environment name is used in two different regions (eu-west-1 and us-west-1);
# both environments are expected to come up VALID and ENABLED
with ComputeEnvironment.create(name="same-name-test-diff", instance_type="g4dn.xlarge", region="eu-west-1") as compute_env:
    display(compute_env.response)
    assert compute_env.name
    assert compute_env.instance_type == "g4dn.xlarge"
    assert compute_env.response["status"] == "VALID"
    assert compute_env.response["state"] == "ENABLED"
    with ComputeEnvironment.create(name="same-name-test-diff", instance_type="g4dn.xlarge", region="us-west-1") as compute_env_us:
        display(compute_env_us.response)
        assert compute_env_us.name
        assert compute_env_us.instance_type == "g4dn.xlarge"
        assert compute_env_us.response["status"] == "VALID"
        assert compute_env_us.response["state"] == "ENABLED"

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/same-name-test-diff', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/same-name-test-diff', status=VALID, state=ENABLED


{'computeEnvironmentName': 'same-name-test-diff',
 'computeEnvironmentArn': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/same-name-test-diff',
 'ecsClusterArn': 'arn:aws:ecs:eu-west-1:617504802562:cluster/AWSBatch-same-name-test-diff-783369ae-146c-32f1-aae9-813606427b66',
 'tags': {},
 'type': 'MANAGED',
 'state': 'ENABLED',
 'status': 'VALID',
 'statusReason': 'ComputeEnvironment Healthy',
 'computeResources': {'type': 'EC2',
  'allocationStrategy': 'BEST_FIT',
  'minvCpus': 0,
  'maxvCpus': 12,
  'desiredvCpus': 0,
  'instanceTypes': ['g4dn.xlarge'],
  'subnets': ['subnet-f296d494', 'subnet-19d65943', 'subnet-aa2974e2'],
  'securityGroupIds': ['sg-5d3ee12b'],
  'instanceRole': 'arn:aws:iam::617504802562:instance-profile/ecsInstanceRole',
  'tags': {},
  'launchTemplate': {'launchTemplateName': 'gitlab-registry-access-template',
   'version': '$Default'},
  'ec2Configuration': [{'imageType': 'ECS_AL2_NVIDIA'}]},
 'serviceRole': 'arn:aws:iam::617504802562:role/aws-service-

[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/same-name-test-diff', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/same-name-test-diff', status=VALID, state=ENABLED


{'computeEnvironmentName': 'same-name-test-diff',
 'computeEnvironmentArn': 'arn:aws:batch:us-west-1:617504802562:compute-environment/same-name-test-diff',
 'ecsClusterArn': 'arn:aws:ecs:us-west-1:617504802562:cluster/AWSBatch-same-name-test-diff-43facc96-e1df-3e1e-8b7d-d6a175b3dd0a',
 'tags': {},
 'type': 'MANAGED',
 'state': 'ENABLED',
 'status': 'VALID',
 'statusReason': 'ComputeEnvironment Healthy',
 'computeResources': {'type': 'EC2',
  'allocationStrategy': 'BEST_FIT',
  'minvCpus': 0,
  'maxvCpus': 12,
  'desiredvCpus': 0,
  'instanceTypes': ['g4dn.xlarge'],
  'subnets': ['subnet-6d53940b', 'subnet-3bf9ea60'],
  'securityGroupIds': ['sg-f40f858b'],
  'instanceRole': 'arn:aws:iam::617504802562:instance-profile/ecsInstanceRole',
  'tags': {},
  'launchTemplate': {'launchTemplateName': 'gitlab-registry-access-template',
   'version': '$Default'},
  'ec2Configuration': [{'imageType': 'ECS_AL2_NVIDIA'}]},
 'serviceRole': 'arn:aws:iam::617504802562:role/aws-service-role/batch.amazonaw

[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/same-name-test-diff', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/same-name-test-diff', status=VALID, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/same-name-test-diff', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/same-name-test-diff', status=VALID, state=DISABLED


In [None]:
#| export


class JobQueue(ContextDecorator):
    """A class for creating and managing the job queues

    An instance wraps a response dict describing an AWS Batch job queue together with the
    region it lives in. When used as a context manager, leaving the block disables the job
    queue, deletes it and waits until it is gone.
    """

    def __init__(self, response: Dict[str, Any], region: str):
        """Constructs a new Job Queue instance

        Args:
            response: The job queue describe response
            region: AWS region of the job queue
        """
        self.response = response
        self.region = region

    @property
    def arn(self) -> str:
        """Get arn of the job queue

        Returns:
            The arn of job queue
        """
        return self.response["jobQueueArn"]

    @property
    def name(self) -> str:
        """Get name of the job queue

        Returns:
            The name of job queue
        """
        return self.response["jobQueueName"]

    @classmethod
    def from_name_or_arn(cls, name: str, region: str) -> "JobQueue":
        """Construct JobQueue object from name or ARN

        Args:
            name: The name or arn of job queue
            region: Region of job queue

        Returns:
            The JobQueue object (an instance of the class the method is called on)

        Raises:
            ValueError: If the lookup does not return exactly one job queue
        """
        client = boto3.client("batch", region_name=region)

        response = client.describe_job_queues(
            jobQueues=[name],
        )

        if len(response["jobQueues"]) != 1:
            raise ValueError(f"{len(response['jobQueues'])=}")

        return cls(response["jobQueues"][0], region)

    @classmethod
    def create(
        cls,
        *,
        name: Optional[str] = None,
        compute_environment: ComputeEnvironment,
        priority: int = 100,
    ) -> "JobQueue":
        """Create a new job queue

        The job queue is created in the region of the given compute environment. If a job
        queue with the given name already exists there, it is returned as is.

        Args:
            name: Name of the job queue; a random name is generated when None
            compute_environment: The ComputeEnvironment object
            priority: Priority to assign to the job queue

        Returns:
            The newly constructed (or already existing) JobQueue object

        Raises:
            ValueError: If more than one job queue matches the name
            RuntimeError: If the newly created job queue ends up in INVALID status
        """
        region = compute_environment.region
        client = boto3.client("batch", region_name=region)

        if name is None:
            name = f"job-queue-{get_random_string()}"
        response = client.describe_job_queues(
            jobQueues=[
                name,
            ],
        )

        if len(response["jobQueues"]) > 1:
            raise ValueError(f"{len(response['jobQueues'])=}")
        elif len(response["jobQueues"]) == 1:
            return cls(response["jobQueues"][0], region=region)

        response = client.create_job_queue(
            jobQueueName=name,
            state="ENABLED",
            priority=priority,
            computeEnvironmentOrder=[
                {"order": 1, "computeEnvironment": compute_environment.arn},
            ],
        )

        # The create response only carries name and ARN; wait() returns the full description.
        # fail_on_invalid stops the polling if creation fails instead of looping forever.
        job_queue = cls(response=response, region=region)
        response = job_queue.wait(
            status="VALID", state="ENABLED", fail_on_invalid=True
        )
        return cls(response=response, region=region)

    def wait(
        self,
        status: Optional[str] = None,
        state: Optional[str] = None,
        is_deleted: bool = False,
        timeout: int = 0,
        sleep_step: int = 1,
        fail_on_invalid: bool = False,
    ) -> Union[None, Dict[str, Any]]:
        """Wait until the job queue reaches the given status and state or until job queue is deleted

        Either both status and state are given, or is_deleted is set - never both.

        Args:
            status: Status to wait for('CREATING'|'UPDATING'|'DELETING'|'DELETED'|'VALID'|'INVALID')
            state: State to wait for('ENABLED'|'DISABLED')
            is_deleted: A flag indicating whether to wait for job queue deletion. If not set, then
                the default value **False** will be used.
            timeout: The maximum time allowed in seconds for the command to complete. If greater than 0,
                then the waiting stops after the timeout and the last description is returned
            sleep_step: The time interval in seconds to check the completion status of the command
            fail_on_invalid: If True, raise as soon as the job queue reports the status
                'INVALID' without matching the requested status and state

        Returns:
            The response of describe job queue, or None once the job queue is found to be
            deleted while waiting with is_deleted set

        Raises:
            RuntimeError: If fail_on_invalid is set and the status 'INVALID' is observed
        """
        # `ensure` is a project helper; presumably it raises when its argument is false
        ensure(is_deleted != ((status is not None) or (state is not None)))
        ensure((status is None) == (state is None))
        client = boto3.client("batch", region_name=self.region)

        i = 0
        while True:
            if 0 < timeout <= i:
                logger.info(f"wait timedout after {i:,d} seconds for arn: '{self.arn}'")
                break
            response = client.describe_job_queues(
                jobQueues=[
                    self.arn,
                ],
            )
            # A deleted job queue is no longer listed at all
            if is_deleted and not response["jobQueues"]:
                logger.info(f"wait(): {self.arn=} deleted")
                return None
            current = response["jobQueues"][0]
            logger.info(
                f'wait(): {self.arn=}, status={current["status"]}, state={current["state"]}'
            )
            if current["status"] == status and current["state"] == state:
                break
            if fail_on_invalid and current["status"] == "INVALID":
                raise RuntimeError(
                    f"Job queue '{self.arn}' is INVALID: {current.get('statusReason')}"
                )
            sleep(sleep_step)
            i = i + sleep_step
        return response["jobQueues"][0]

    def update(self, *args, **kwargs):
        """Update job queue

        The arguments are forwarded to the Batch client's update_job_queue call; the ARN of
        this job queue is supplied automatically.
        """
        client = boto3.client("batch", region_name=self.region)
        client.update_job_queue(jobQueue=self.arn, *args, **kwargs)

    def delete(self):
        """Delete job queue"""
        client = boto3.client("batch", region_name=self.region)
        client.delete_job_queue(
            jobQueue=self.arn,
        )

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        # The queue is disabled, and the change awaited, before it is deleted
        self.update(state="DISABLED")
        self.wait(status="VALID", state="DISABLED")
        self.delete()
        # Block until the queue is really gone
        self.wait(is_deleted=True)
        # Never swallow an exception raised inside the with block
        return False

In [None]:
#| export


@patch
def create_job_queue(
    self: ComputeEnvironment, *, name: Optional[str] = None, priority: int = 100
):
    """Create a job queue that runs on this compute environment

    Args:
        name: Name of the job queue, passed on unchanged to JobQueue.create
        priority: Priority to assign to the job queue

    Returns:
        The result of JobQueue.create for this compute environment
    """
    queue_options = dict(name=name, priority=priority)
    return JobQueue.create(compute_environment=self, **queue_options)

In [None]:
# For each region: create a compute environment, create a job queue on top of it and check
# that the queue comes up VALID and ENABLED. Leaving the with blocks tears down the job
# queue first and the compute environment afterwards.
for region in ["eu-west-1", "us-west-2"]:
    with ComputeEnvironment.create(instance_type="g4dn.xlarge", region=region) as compute_env:
        with compute_env.create_job_queue() as job_queue:
            display(job_queue.response)
            assert job_queue.name
            assert job_queue.response["status"] == "VALID"
            assert job_queue.response["state"] == "ENABLED"

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-U6Z8O2', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-U6Z8O2', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=VALID, state=ENABLED


{'jobQueueName': 'job-queue-IXP0P5',
 'jobQueueArn': 'arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5',
 'state': 'ENABLED',
 'status': 'VALID',
 'statusReason': 'JobQueue Healthy',
 'priority': 100,
 'computeEnvironmentOrder': [{'order': 1,
   'computeEnvironment': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-U6Z8O2'}],
 'tags': {}}

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=VALID, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-IXP0P5', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:

{'jobQueueName': 'job-queue-YHU5KG',
 'jobQueueArn': 'arn:aws:batch:us-west-2:617504802562:job-queue/job-queue-YHU5KG',
 'state': 'ENABLED',
 'status': 'VALID',
 'statusReason': 'JobQueue Healthy',
 'priority': 100,
 'computeEnvironmentOrder': [{'order': 1,
   'computeEnvironment': 'arn:aws:batch:us-west-2:617504802562:compute-environment/compute-environment-QRAUVP'}],
 'tags': {}}

[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-2:617504802562:job-queue/job-queue-YHU5KG', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-2:617504802562:job-queue/job-queue-YHU5KG', status=VALID, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-2:617504802562:job-queue/job-queue-YHU5KG' deleted
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-2:617504802562:compute-environment/compute-environment-QRAUVP', status=VALID, state=DISABLED


In [None]:
#| export


def get_max_vcpus_memory_for_container(host_vcpus: int, host_memory: int) -> Tuple[int, int]:
    """Work out how many VCPUs and how much memory of a host a container may be given

    Args:
        host_vcpus: available VCPUs on the host
        host_memory: available memory on the host(in MiB)

    Returns:
        A tuple of (max VCPUs, max memory) that can be allocated to the container
    """
    # Hosts with up to 4 VCPUs hold back one VCPU (but the container always gets at least
    # one); bigger hosts hold back two
    container_vcpus = max(1, host_vcpus - 1) if host_vcpus <= 4 else host_vcpus - 2

    # 256 units of memory are held back per host VCPU
    held_back_memory = host_vcpus * 256
    container_memory = host_memory - held_back_memory

    return container_vcpus, container_memory

In [None]:
# Table-driven check: for each instance type, the host VCPUs/memory and the container limits
# that get_max_vcpus_memory_for_container is expected to derive from them
test_cases = {
    "g4dn.xlarge": {
        "host_vcpus": 4,
        "host_memory": 16384,
        "expected_max_container_vcpus": 3,
        "expected_max_container_memory": 15360,
    },
    "r5.8xlarge": {
        "host_vcpus": 32,
        "host_memory": 262144,
        "expected_max_container_vcpus": 30,
        "expected_max_container_memory": 253952,
    },
    "r5.16xlarge": {
        "host_vcpus": 64,
        "host_memory": 524288,
        "expected_max_container_vcpus": 62,
        "expected_max_container_memory": 507904,
    },
}

# On failure the actual value is used as the assertion message
for instance, info in test_cases.items():
    (
        actual_max_container_vcpus,
        actual_max_container_memory,
    ) = get_max_vcpus_memory_for_container(
        host_vcpus=info["host_vcpus"], host_memory=info["host_memory"]
    )
    assert (
        actual_max_container_vcpus == info["expected_max_container_vcpus"]
    ), actual_max_container_vcpus
    assert (
        actual_max_container_memory == info["expected_max_container_memory"]
    ), actual_max_container_memory
    display(
        f'{instance} has host_vcpus={info["host_vcpus"]}, host_memory={info["host_memory"]} and max_container_vcpus={actual_max_container_vcpus}, max_container_memory={actual_max_container_memory}'
    )

'g4dn.xlarge has host_vcpus=4, host_memory=16384 and max_container_vcpus=3, max_container_memory=15360'

'r5.8xlarge has host_vcpus=32, host_memory=262144 and max_container_vcpus=30, max_container_memory=253952'

'r5.16xlarge has host_vcpus=64, host_memory=524288 and max_container_vcpus=62, max_container_memory=507904'

In [None]:
#| export


class JobDefinition(ContextDecorator):
    """A class for creating and managing the job definition

    An instance wraps the dictionary returned by the AWS Batch job definition
    APIs together with the region it lives in. When used as a context manager,
    the job definition is deregistered on exit and the exit blocks until AWS
    reports it as INACTIVE.
    """

    def __init__(self, response, region: str):
        """Constructs a new JobDefinition instance

        Args:
            response: job definition describe response
            region: region of the job definition
        """
        self.response = response
        self.region = region

    @property
    def arn(self) -> str:
        """Get ARN of the job definition

        Returns:
            The ARN of job definition
        """
        return self.response["jobDefinitionArn"]

    @property
    def name(self) -> str:
        """Get name of the job definition

        Returns:
            The name of job definition
        """
        return self.response["jobDefinitionName"]

    @classmethod
    def from_name_or_arn(cls, name: str, region: str) -> "JobDefinition":
        """Construct the JobDefinition object from name or from ARN

        Args:
            name: Name or ARN of job definition
            region: Region of job definition

        Returns:
            The JobDefinition object

        Raises:
            ValueError: If the lookup does not return exactly one job definition
        """
        client = boto3.client("batch", region_name=region)

        response = client.describe_job_definitions(
            jobDefinitions=[name],
        )

        # Zero matches and several matches are both errors; name the lookup key
        # and region so the failure can be diagnosed from the message alone.
        if len(response["jobDefinitions"]) != 1:
            raise ValueError(
                f"{len(response['jobDefinitions'])=} for job definition '{name}' in region '{region}'"
            )

        return cls(response["jobDefinitions"][0], region)

    @classmethod
    def create(
        cls,
        *,
        name: Optional[str] = None,
        image: str,
        job_role_arn: str = "arn:aws:iam::617504802562:role/ecsTaskExecutionRole",
        execution_role_arn: str = "arn:aws:iam::617504802562:role/ecsTaskExecutionRole",
        compute_environment: ComputeEnvironment,
        command: Optional[str] = None,
        environment_vars: Optional[Dict[str, str]] = None,
        retries: int = 3,
    ) -> "JobDefinition":
        """
        Create job definition

        If a job definition with the given name already exists, it is returned
        as is and nothing new is registered. Otherwise a new definition is
        registered with resources sized for the instance type of the compute
        environment, and the call blocks until it is ACTIVE.

        Args:
            name: Name of the job definition. A random name is generated if None
            image: Image to start container
            job_role_arn: ARN of the IAM role that the container can assume for AWS permissions
            execution_role_arn: ARN of the execution role that batch job can assume
            compute_environment: ComputeEnvironment object
            command: Command to execute after starting container
            environment_vars: Environment vars to set in the container
            retries: Times to retry if the job fails (includes first execution)

        Returns:
            The JobDefinition object

        Raises:
            ValueError: If more than one existing job definition is returned for the name
        """
        region = compute_environment.region
        client = boto3.client("batch", region_name=region)

        if name is None:
            name = f"job-definition-{get_random_string()}"

        # NOTE(review): this lookup does not filter by status, so a previously
        # deregistered (INACTIVE) definition with the same name may be returned
        # here - confirm whether status="ACTIVE" should be passed.
        response = client.describe_job_definitions(
            maxResults=1,
            jobDefinitionName=name,
        )

        if len(response["jobDefinitions"]) > 1:
            raise ValueError(f"{len(response['jobDefinitions'])=}")
        elif len(response["jobDefinitions"]) == 1:
            return cls(response["jobDefinitions"][0], region=region)

        vcpus, memory, gpu = get_instance_info(
            instance_type=compute_environment.instance_type, region=region
        )
        container_vcpus, container_memory = get_max_vcpus_memory_for_container(
            host_vcpus=vcpus, host_memory=memory
        )
        container_properties: Dict[str, Any] = {
            "image": image,
            "jobRoleArn": job_role_arn,
            "executionRoleArn": execution_role_arn,
            "resourceRequirements": [
                dict(value=str(container_vcpus), type="VCPU"),
                dict(value=str(container_memory), type="MEMORY"),
            ],
            "logConfiguration": {
                "logDriver": "awslogs",
            },
        }
        # A GPU requirement is only added for instance types that have GPUs
        if gpu > 0:
            container_properties["resourceRequirements"].append(
                dict(value=str(gpu), type="GPU")
            )
        if command is not None:
            container_properties["command"] = shlex.split(command)
        if environment_vars is not None:
            # Distinct loop names so the job definition `name` is not shadowed
            container_properties["environment"] = [
                dict(name=key, value=value) for key, value in environment_vars.items()
            ]

        response = client.register_job_definition(
            jobDefinitionName=name,
            type="container",
            containerProperties=container_properties,
            retryStrategy={
                "attempts": retries,
            },
            platformCapabilities=[
                "EC2",
            ],
        )

        # The register response carries the ARN; wait() returns the full
        # describe response, which becomes the state of the returned object.
        job_definition = cls(response=response, region=region)
        response = job_definition.wait(status="ACTIVE")
        return cls(response=response, region=region)

    def wait(
        self,
        status: str,
        timeout: int = 0,
        sleep_step: int = 1,
    ) -> Dict[str, Any]:
        """
        Wait until job definition reaches the given status

        Args:
            status: Status to wait for('ACTIVE'|'INACTIVE')
            timeout: The maximum time in seconds to keep polling. If greater than 0, polling
                stops once the accumulated sleep time reaches the timeout and the last
                describe response is returned. If 0, polling continues until the status is reached
            sleep_step: The time interval in seconds between two status checks

        Returns:
            The response of describe job definition
        """
        client = boto3.client("batch", region_name=self.region)

        # `i` counts slept seconds only; time spent in the API calls is not included
        i = 0
        while True:
            if 0 < timeout <= i:
                logger.info(f"wait timedout after {i:,d} seconds for arn: '{self.arn}'")
                break
            response = client.describe_job_definitions(
                jobDefinitions=[self.arn],
            )
            logger.info(
                f'wait(): {self.arn=}, status={response["jobDefinitions"][0]["status"]}'
            )
            if response["jobDefinitions"][0]["status"] == status:
                break
            sleep(sleep_step)
            i = i + sleep_step
        return response["jobDefinitions"][0]

    def delete(self):
        """Delete job definition by deregistering it"""
        client = boto3.client("batch", region_name=self.region)
        client.deregister_job_definition(
            jobDefinition=self.arn,
        )

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        # Deregister and block until AWS reports INACTIVE; exceptions raised
        # inside the with-block are not suppressed (returns False).
        self.delete()
        self.wait(status="INACTIVE")
        return False

In [None]:
#| export


@patch
def create_job_definition(
    self: ComputeEnvironment,
    *,
    name: Optional[str] = None,
    image: str,
    job_role_arn: str = "arn:aws:iam::617504802562:role/ecsTaskExecutionRole",
    execution_role_arn: str = "arn:aws:iam::617504802562:role/ecsTaskExecutionRole",
    command: Optional[str] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    retries: int = 3,
):
    """Create a job definition for this compute environment

    Convenience wrapper that forwards all arguments to JobDefinition.create
    and passes this object as the compute_environment.

    Args:
        name: Name of the job definition
        image: Image to start container
        job_role_arn: ARN of the IAM role that the container can assume for AWS permissions
        execution_role_arn: ARN of the execution role that batch job can assume
        command: Command to execute after starting container
        environment_vars: Environment vars to set in the container
        retries: Times to retry if the job fails (includes first execution)

    Returns:
        The result of JobDefinition.create
    """
    forwarded = dict(
        name=name,
        image=image,
        job_role_arn=job_role_arn,
        execution_role_arn=execution_role_arn,
        command=command,
        environment_vars=environment_vars,
        retries=retries,
    )
    return JobDefinition.create(compute_environment=self, **forwarded)

In [None]:
# Smoke test: create a compute environment, register a job definition in it and
# check the definition is ACTIVE. Leaving the inner block deregisters the job
# definition (JobDefinition.__exit__ deletes it and waits for INACTIVE).
with ComputeEnvironment.create(instance_type="g4dn.xlarge", region="eu-west-1") as compute_env:
    with compute_env.create_job_definition(
        image="busybox", command="sleep 10"
    ) as job_definition:
        display(job_definition.response)
        assert job_definition.name
        assert job_definition.response["status"] == "ACTIVE"

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MBHWRX', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MBHWRX', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-definition/job-definition-L2SO4X:1', status=ACTIVE


{'jobDefinitionName': 'job-definition-L2SO4X',
 'jobDefinitionArn': 'arn:aws:batch:eu-west-1:617504802562:job-definition/job-definition-L2SO4X:1',
 'revision': 1,
 'status': 'ACTIVE',
 'type': 'container',
 'parameters': {},
 'retryStrategy': {'attempts': 3, 'evaluateOnExit': []},
 'containerProperties': {'image': 'busybox',
  'command': ['sleep', '10'],
  'jobRoleArn': 'arn:aws:iam::617504802562:role/ecsTaskExecutionRole',
  'executionRoleArn': 'arn:aws:iam::617504802562:role/ecsTaskExecutionRole',
  'volumes': [],
  'environment': [],
  'mountPoints': [],
  'ulimits': [],
  'resourceRequirements': [{'value': '3', 'type': 'VCPU'},
   {'value': '15360', 'type': 'MEMORY'},
   {'value': '1', 'type': 'GPU'}],
  'logConfiguration': {'logDriver': 'awslogs',
   'options': {},
   'secretOptions': []},
  'secrets': []},
 'tags': {},
 'platformCapabilities': ['EC2']}

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-definition/job-definition-L2SO4X:1', status=INACTIVE
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MBHWRX', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-MBHWRX', status=VALID, state=DISABLED


In [None]:
#| export


class Job(ContextDecorator):
    """A class for creating and managing the jobs

    An instance wraps the dictionary returned by the AWS Batch job APIs
    together with the region of the job. When used as a context manager, the
    exit blocks until the job has SUCCEEDED (see wait()).
    """

    def __init__(self, response, region: str):
        """Constructs a new Job instance

        Args:
            response: job describe response
            region: region of job
        """
        self.response = response
        self.region = region

    @property
    def arn(self) -> str:
        """Get ARN of the job

        Returns:
            ARN of the job
        """
        return self.response["jobArn"]

    @property
    def name(self) -> str:
        """Get name of the job

        Returns:
            name of the job
        """
        return self.response["jobName"]

    @property
    def job_id(self) -> str:
        """Get job id of the job

        Returns:
            job id of the job
        """
        return self.response["jobId"]

    @classmethod
    def from_job_id(cls, job_id: str, region: str) -> "Job":
        """Construct Job object from job id

        Args:
            job_id: id of the job
            region: region of the job

        Returns:
            The Job object

        Raises:
            ValueError: If the lookup does not return exactly one job
        """
        client = boto3.client("batch", region_name=region)

        response = client.describe_jobs(
            jobs=[job_id],
        )

        # Zero matches and several matches are both errors; include the id and
        # region so the failure can be diagnosed from the message alone.
        if len(response["jobs"]) != 1:
            raise ValueError(
                f"{len(response['jobs'])=} for job id '{job_id}' in region '{region}'"
            )

        return cls(response["jobs"][0], region)

    @classmethod
    def create(
        cls,
        *,
        name: Optional[str] = None,
        job_queue: JobQueue,
        job_definition: JobDefinition,
        vcpus: Optional[int] = None,
        memory: Optional[int] = None,
        gpu: Optional[int] = None,
        command: Optional[str] = None,
        environment_vars: Optional[Dict[str, str]] = None,
        retries: Optional[int] = None,
    ) -> "Job":
        """
        Create job

        The job is submitted in the region of the job definition. Only the
        overrides that are not None are sent; everything else is taken from
        the job definition.

        Args:
            name: Name of the job. A random name is generated if None
            job_queue: JobQueue object
            job_definition: JobDefinition object
            vcpus: Overwrite VCPUs value in job definition
            memory: Overwrite memory value in job definition
            gpu: Overwrite GPU value in job definition
            command: Command to execute after starting container
            environment_vars: Environment vars to set in the container
            retries: Times to retry if job fails (includes first execution)

        Returns:
            The Job object
        """
        client = boto3.client("batch", region_name=job_definition.region)

        if name is None:
            name = f"job-{get_random_string()}"

        container_overrides: Dict[str, Any] = {"resourceRequirements": []}

        if vcpus is not None:
            container_overrides["resourceRequirements"].append(
                dict(value=str(vcpus), type="VCPU")
            )
        if memory is not None:
            container_overrides["resourceRequirements"].append(
                dict(value=str(memory), type="MEMORY")
            )
        if gpu is not None:
            container_overrides["resourceRequirements"].append(
                dict(value=str(gpu), type="GPU")
            )
        if command is not None:
            container_overrides["command"] = shlex.split(command)
        if environment_vars is not None:
            # Distinct loop names so the job `name` is not shadowed
            container_overrides["environment"] = [
                dict(name=key, value=value) for key, value in environment_vars.items()
            ]

        params: Dict[str, Any] = dict(
            jobName=name,
            jobQueue=job_queue.arn,
            jobDefinition=job_definition.arn,
            containerOverrides=container_overrides,
        )

        # Without an explicit value the retry strategy of the job definition applies
        if retries is not None:
            params["retryStrategy"] = dict(attempts=retries)

        response = client.submit_job(**params)

        return cls(response, job_definition.region)

    def wait(
        self,
        status: str,
        timeout: int = 0,
        sleep_step: int = 1,
    ) -> Dict[str, Any]:
        """
        Wait until job reaches the given status or until it fails

        Polling only stops when the job status equals the requested status,
        when the job is FAILED, or when the timeout is reached. A job that has
        already moved past the requested status (e.g. SUCCEEDED while waiting
        for RUNNING) keeps being polled until the timeout, if any.

        Args:
            status: Status to wait for('SUBMITTED'|'PENDING'|'RUNNABLE'|'STARTING'|'RUNNING'|'SUCCEEDED'|'FAILED')
            timeout: The maximum time in seconds to keep polling. If greater than 0, polling
                stops once the accumulated sleep time reaches the timeout and the last
                describe response is returned. If 0, there is no time limit
            sleep_step: The time interval in seconds between two status checks

        Returns:
            response of describe job

        Raises:
            ValueError: If the job is FAILED and FAILED is not the requested status
        """
        client = boto3.client("batch", region_name=self.region)

        # `i` counts slept seconds only; time spent in the API calls is not included
        i = 0
        while True:
            if 0 < timeout <= i:
                logger.info(f"wait timedout after {i:,d} seconds for arn: '{self.arn}'")
                break
            response = client.describe_jobs(
                jobs=[self.job_id],
            )
            logger.info(
                f'wait(): {self.job_id=}, status={response["jobs"][0]["status"]}'
            )
            if response["jobs"][0]["status"] == status:
                break
            elif response["jobs"][0]["status"] == "FAILED":
                raise ValueError(f'{response["jobs"][0]["status"]=}')
            sleep(sleep_step)
            i = i + sleep_step
        return response["jobs"][0]

    def delete(self, reason: str = "Terminated by user"):
        """Delete job by terminating it

        Args:
            reason: Message attached to the job explaining why it was terminated.
                The Batch TerminateJob API requires a reason, so a default is supplied
        """
        client = boto3.client("batch", region_name=self.region)
        client.terminate_job(
            jobId=self.job_id,
            reason=reason,
        )

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        # Block until the job has SUCCEEDED (wait() raises ValueError if it
        # FAILED); exceptions raised inside the with-block are not suppressed.
        self.wait(status="SUCCEEDED")
        return False

In [None]:
#| export


@patch
def create_job(
    self: JobQueue,
    *,
    name: Optional[str] = None,
    job_definition: JobDefinition,
    vcpus: Optional[int] = None,
    memory: Optional[int] = None,
    gpu: Optional[int] = None,
    command: Optional[str] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    retries: Optional[int] = None,
):
    """Create a job in this job queue

    Convenience wrapper that forwards all arguments to Job.create and passes
    this object as the job_queue.

    Args:
        name: Name of the job
        job_definition: JobDefinition object
        vcpus: Overwrite VCPUs value in job definition
        memory: Overwrite memory value in job definition
        gpu: Overwrite GPU value in job definition
        command: Command to execute after starting container
        environment_vars: Environment vars to set in the container
        retries: Times to retry if job fails (includes first execution)

    Returns:
        The result of Job.create
    """
    forwarded = dict(
        name=name,
        job_definition=job_definition,
        vcpus=vcpus,
        memory=memory,
        gpu=gpu,
        command=command,
        environment_vars=environment_vars,
        retries=retries,
    )
    return Job.create(job_queue=self, **forwarded)

In [None]:
#| export


@patch  # type: ignore
def create_job(
    self: JobDefinition,
    *,
    name: Optional[str] = None,
    job_queue: JobQueue,
    vcpus: Optional[int] = None,
    memory: Optional[int] = None,
    gpu: Optional[int] = None,
    command: Optional[str] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    retries: Optional[int] = None,
):
    """Create a job from this job definition

    Convenience wrapper that forwards all arguments to Job.create and passes
    this object as the job_definition.

    Args:
        name: Name of the job
        job_queue: JobQueue object
        vcpus: Overwrite VCPUs value in job definition
        memory: Overwrite memory value in job definition
        gpu: Overwrite GPU value in job definition
        command: Command to execute after starting container
        environment_vars: Environment vars to set in the container
        retries: Times to retry if job fails (includes first execution)

    Returns:
        The result of Job.create
    """
    forwarded = dict(
        name=name,
        job_queue=job_queue,
        vcpus=vcpus,
        memory=memory,
        gpu=gpu,
        command=command,
        environment_vars=environment_vars,
        retries=retries,
    )
    return Job.create(job_definition=self, **forwarded)

In [None]:
# | eval: false


# End-to-end smoke test: compute environment -> job queue -> job definition -> job.
# Leaving the innermost block waits for the job to reach SUCCEEDED (Job.__exit__);
# the enclosing context managers are then exited in reverse order of creation.
with ComputeEnvironment.create(instance_type="g4dn.xlarge", region="eu-west-1") as compute_env:
    with compute_env.create_job_queue() as job_queue:
        with compute_env.create_job_definition(
            image="busybox",
            command="sleep 10",
        ) as job_definition:
            with job_definition.create_job(job_queue=job_queue) as job:
                display(job.response)
                assert job.name

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-Z1LQOY', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/compute-environment-Z1LQOY', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-definition/job-definition-U6F26H:1', status=ACTIVE


{'ResponseMetadata': {'RequestId': 'dcbbceb8-c796-4029-bfc9-78f6464358a7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 20 Oct 2022 06:52:12 GMT',
   'content-type': 'application/json',
   'content-length': '160',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'dcbbceb8-c796-4029-bfc9-78f6464358a7',
   'access-control-allow-origin': '*',
   'x-amz-apigw-id': 'aSp0cGNGDoEFcWA=',
   'access-control-expose-headers': 'X-amzn-errortype,X-amzn-requestid,X-amzn-errormessage,X-amzn-trace-id,X-amz-apigw-id,date',
   'x-amzn-trace-id': 'Root=1-6350f01c-74569c014a1ca27b327bbfd5'},
  'RetryAttempts': 0},
 'jobArn': 'arn:aws:batch:eu-west-1:617504802562:job/ee9471dd-f89b-4e19-af1c-aa23215b7462',
 'jobName': 'job-L4UMS7',
 'jobId': 'ee9471dd-f89b-4e19-af1c-aa23215b7462'}

[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa23215b7462', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='ee9471dd-f89b-4e19-af1c-aa2321

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/job-queue-Y90LVP', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:6175048025

In [None]:
# Container image used by the region smoke test in the next cell (no tag is given here)
AIRT_SERVICE_IMAGE="registry.gitlab.com/airt.ai/airt-service"

In [None]:
# | eval: false

# for region in ["eu-west-1", "eu-west-3", "ap-south-1"]:
# Per-region end-to-end smoke test using the airt-service image: for every region,
# create compute environment -> job queue -> job definition -> job. Leaving the
# innermost block waits for the job to reach SUCCEEDED (Job.__exit__).
for region in ["ap-south-1"]:
    display(f"The region we are executing is {region}")
    with ComputeEnvironment.create(instance_type="g4dn.xlarge", region=region) as compute_env:
        with compute_env.create_job_queue() as job_queue:
            with compute_env.create_job_definition(
                image=AIRT_SERVICE_IMAGE,
                command="sleep 10",
            ) as job_definition:
                with job_definition.create_job(job_queue=job_queue) as job:
                    display(job.response)
                    assert job.name
    # Visual separator between regions in the cell output
    display("*"*100)

'The region we are executing is ap-south-1'

[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:compute-environment/compute-environment-PK93WH', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:compute-environment/compute-environment-PK93WH', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:compute-environment/compute-environment-PK93WH', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=VALID, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-definition/job-definition-OI7GPT:1', status=ACTIVE


{'ResponseMetadata': {'RequestId': 'b602f551-46c1-4b80-99c1-90c38487a789',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 20 Oct 2022 06:55:01 GMT',
   'content-type': 'application/json',
   'content-length': '161',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'b602f551-46c1-4b80-99c1-90c38487a789',
   'access-control-allow-origin': '*',
   'x-amz-apigw-id': 'aSqO4Hr-BcwFdIw=',
   'access-control-expose-headers': 'X-amzn-errortype,X-amzn-requestid,X-amzn-errormessage,X-amzn-trace-id,X-amz-apigw-id,date',
   'x-amzn-trace-id': 'Root=1-6350f0c5-2d86f5640374c7e85392eed5'},
  'RetryAttempts': 0},
 'jobArn': 'arn:aws:batch:ap-south-1:617504802562:job/18323046-b2d8-4878-ad5a-66f5b0310290',
 'jobName': 'job-YCJ284',
 'jobId': '18323046-b2d8-4878-ad5a-66f5b0310290'}

[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=SUBMITTED
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5

[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=RUNNABLE
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b031

[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b031

[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b031

[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b0310290', status=STARTING
[INFO] __main__: wait(): self.job_id='18323046-b2d8-4878-ad5a-66f5b031

[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:617504802562:job-queue/job-queue-4YNO66', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:ap-south-1:61

'****************************************************************************************************'

In [None]:
#| export


def aws_batch_create_job(  # type: ignore
    *,
    name: Optional[str] = None,
    job_queue_arn: str,
    job_definition_arn: str,
    region: str,
    vcpus: Optional[int] = None,
    memory: Optional[int] = None,
    gpu: Optional[int] = None,
    command: Optional[str] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    retries: Optional[int] = None,
) -> Job:
    """Create a job from a job queue ARN and a job definition ARN

    The job queue and the job definition are looked up in the given region
    (queue first, then definition) and the job is created from them with
    Job.create. The ARN of the created job is logged.

    Args:
        name: Name of the job
        job_queue_arn: ARN of job queue
        job_definition_arn: ARN of job definition
        region: Region to create job
        vcpus: Overwrite VCPUs value in job definition
        memory: Overwrite memory value in job definition
        gpu: Overwrite GPU value in job definition
        command: Command to execute after starting container
        environment_vars: Environment vars to set in the container
        retries: Times to retry if job fails(includes first execution)

    Returns:
        The created Job object
    """
    overrides = dict(
        vcpus=vcpus,
        memory=memory,
        gpu=gpu,
        command=command,
        environment_vars=environment_vars,
        retries=retries,
    )
    # Keyword arguments are evaluated left to right, so the queue lookup still
    # happens before the job definition lookup.
    job = Job.create(
        name=name,
        job_queue=JobQueue.from_name_or_arn(job_queue_arn, region),
        job_definition=JobDefinition.from_name_or_arn(job_definition_arn, region),
        **overrides,
    )
    logger.info(f"{job.arn=}")
    return job

In [None]:
#| export

def _create_default_batch_environment_config(
    prefix: str, output_path: Union[str, Path], regions: Optional[List[str]] = None
):
    """Generate batch environment YAML config to set up the batch job environment

    For every region the config contains one entry per task (preprocessing,
    training, predictions, csv_processing), each describing a compute
    environment, a job queue and a job definition whose names start with the
    given prefix.

    Args:
        prefix: Prefix to use in names
        output_path: Path of yaml file to store generated config
        regions: Regions to generate the config for. If None or empty, the regions
            returned by get_available_aws_regions() are used
    """

    def _task_config(
        task_name: str, image: str, instance_type: str, prefix: str
    ) -> Dict[str, Any]:
        # Build the mapping directly instead of formatting values into YAML text
        # and parsing it back: a prefix containing YAML syntax (e.g. " #" or ": ")
        # would otherwise truncate the names or break parsing.
        return {
            "compute_environment": {
                "name": f"{prefix}_{task_name}_compute_environment",
                "instance_type": instance_type,
                "min_instances": 0,
                "max_instances": 10,
            },
            "job_queue": {
                "name": f"{prefix}_{task_name}_job_queue",
                "priority": 100,
            },
            "job_definition": {
                "name": f"{prefix}_{task_name}_job_definition",
                "image": image,
            },
        }

    instance_types = dict(
        preprocessing="r5.16xlarge",
        training="g4dn.xlarge",
        predictions="g4dn.xlarge",
        csv_processing="r5.16xlarge",
    )

    image = "registry.gitlab.com/airt.ai/airt-service:dev"

    if not regions:
        regions = get_available_aws_regions()

    # Every region gets its own freshly built dictionaries; sharing one object
    # between regions would make yaml.dump emit anchors/aliases.
    config = {
        region: {
            task_name: _task_config(task_name, image, instance_type, prefix)
            for task_name, instance_type in instance_types.items()
        }
        for region in regions
    }

    with open(output_path, "w") as f:
        yaml.dump(config, f, default_flow_style=False)

In [None]:
#| export


@call_parse
def create_default_batch_environment_config(
    prefix: Param("prefix", str),  # type: ignore
    output_path: Param("output_path", str),  # type: ignore
    # nargs="+" makes argparse collect one or more strings into a list.
    # The previous `type=List[str]` cannot be instantiated, so every value
    # passed to --regions on the command line was rejected.
    regions: Param("regions", str, nargs="+") = None,  # type: ignore
):
    """Generate batch environment YAML config to set up the batch job environment

    Args:
        prefix: Prefix to use in names
        output_path: Path of yaml file to store generated config
        regions: Regions to generate the config for. When omitted, the default
            region selection of _create_default_batch_environment_config is used.
    """

    _create_default_batch_environment_config(
        prefix=prefix, output_path=output_path, regions=regions
    )

In [None]:
# Generate a config for two regions into a throw-away directory and show it.
with tempfile.TemporaryDirectory() as tmp_dir:
    config_path = Path(tmp_dir) / "output.yaml"
    create_default_batch_environment_config(
        prefix="testing",
        output_path=config_path,
        regions=["eu-west-1", "us-west-1"],
    )
    assert config_path.exists()
    with open(config_path) as config_file:
        generated_config = yaml.safe_load(config_file)
    display(generated_config)

{'eu-west-1': {'csv_processing': {'compute_environment': {'instance_type': 'r5.16xlarge',
    'max_instances': 10,
    'min_instances': 0,
    'name': 'testing_csv_processing_compute_environment'},
   'job_definition': {'image': 'registry.gitlab.com/airt.ai/airt-service:dev',
    'name': 'testing_csv_processing_job_definition'},
   'job_queue': {'name': 'testing_csv_processing_job_queue', 'priority': 100}},
  'predictions': {'compute_environment': {'instance_type': 'g4dn.xlarge',
    'max_instances': 10,
    'min_instances': 0,
    'name': 'testing_predictions_compute_environment'},
   'job_definition': {'image': 'registry.gitlab.com/airt.ai/airt-service:dev',
    'name': 'testing_predictions_job_definition'},
   'job_queue': {'name': 'testing_predictions_job_queue', 'priority': 100}},
  'preprocessing': {'compute_environment': {'instance_type': 'r5.16xlarge',
    'max_instances': 10,
    'min_instances': 0,
    'name': 'testing_preprocessing_compute_environment'},
   'job_definition':

In [None]:
#| export


def _create_batch_environment(input_yaml_path: str, output_yaml_path: str):
    """Create the batch resources described in a YAML config and record their ARNs

    For every region and task found in the input config, a compute environment,
    a job queue and a job definition are created (in that order). The ARNs of
    the created resources are written to the output YAML file, grouped by
    region and task name.

    Args:
        input_yaml_path: YAML config file path for creating the batch environment
        output_yaml_path: YAML file path to store the created environment ARN
    """
    with open(input_yaml_path) as config_file:
        env_config = yaml.safe_load(config_file)

    created_arns: Dict[str, Dict[str, Dict[str, str]]] = {}

    for region, region_tasks in env_config.items():
        region_arns: Dict[str, Dict[str, str]] = {}
        created_arns[region] = region_arns

        for task_name, task_config in region_tasks.items():
            sanitized_print(f"{task_name=}")

            # The job queue and job definition are both created through the
            # compute environment object.
            compute_env = ComputeEnvironment.create(
                region=region, **task_config["compute_environment"]
            )
            sanitized_print(f"{compute_env.arn=}")

            job_queue = compute_env.create_job_queue(**task_config["job_queue"])  # type: ignore
            sanitized_print(f"{job_queue.arn=}")

            job_definition = compute_env.create_job_definition(**task_config["job_definition"])  # type: ignore
            sanitized_print(f"{job_definition.arn=}")

            region_arns[task_name] = {
                "compute_environment_arn": compute_env.arn,
                "job_queue_arn": job_queue.arn,
                "job_definition_arn": job_definition.arn,
            }

    with open(output_yaml_path, "w") as arn_file:
        yaml.dump(created_arns, arn_file, default_flow_style=False)

In [None]:
#| export


@call_parse
def create_batch_environment(
    input_yaml_path: Param("yaml_path", str),  # type: ignore
    output_yaml_path: Param("yaml_path", str),  # type: ignore
):
    """Command-line entry point that creates a batch environment from a YAML config

    Delegates to _create_batch_environment, which creates the resources and
    stores the created environment ARNs in the output YAML file.

    Args:
        input_yaml_path: YAML config file path for creating the batch environment
        output_yaml_path: YAML file path to store the created environment ARN
    """
    _create_batch_environment(input_yaml_path, output_yaml_path)

In [None]:
#| export


@contextmanager
def create_testing_batch_environment_ctx(input_yaml_path: str, output_yaml_path: str):
    """Context manager that creates a batch environment for tests and removes it on exit

    The environment is created before the body of the ``with`` statement runs.
    On exit (normal or via an exception) the ARNs recorded in the output file
    are read back and, for every region and task, the job definition, the job
    queue and the compute environment are deleted in that order.

    Args:
        input_yaml_path: path of yaml file which has config to create batch environment
        output_yaml_path: path of yaml file to store created environment arn
    """
    _create_batch_environment(
        input_yaml_path=input_yaml_path, output_yaml_path=output_yaml_path
    )
    try:
        yield
    finally:
        with open(output_yaml_path) as arn_file:
            created_env = yaml.safe_load(arn_file)

        for region, region_tasks in created_env.items():
            for task_name, arns in region_tasks.items():
                sanitized_print(f"deleting job definition - {task_name}")
                JobDefinition.from_name_or_arn(
                    arns["job_definition_arn"], region
                ).delete()

                # The queue is disabled and waited on before it is deleted, and
                # its deletion is awaited before the compute environment is touched.
                sanitized_print(f"deleting job queue - {task_name}")
                queue = JobQueue.from_name_or_arn(arns["job_queue_arn"], region)
                queue.update(state="DISABLED")
                queue.wait(status="VALID", state="DISABLED")
                queue.delete()
                queue.wait(is_deleted=True)

                # Same disable-then-delete sequence for the compute environment;
                # its deletion itself is not awaited.
                sanitized_print(f"deleting compute env - {task_name}")
                environment = ComputeEnvironment.from_name_or_arn(
                    arns["compute_environment_arn"], region
                )
                environment.update(state="DISABLED")
                environment.wait(status="VALID", state="DISABLED")
                environment.delete()

In [None]:
# | eval: false

# End-to-end check: generate a config, create the real batch environment from
# it, inspect the recorded ARNs, and let the context manager tear it all down.
with tempfile.TemporaryDirectory() as tmp_dir:
    workdir = Path(tmp_dir)
    env_config_path = workdir / "env_config.yaml"
    created_env_info_path = workdir / "output_file.yaml"
    create_default_batch_environment_config(
        prefix="testing",
        output_path=env_config_path,
        regions=["eu-west-1", "us-west-1"],
    )

    env_config = yaml.safe_load(env_config_path.read_text())
    display(f"{env_config=}")

    with create_testing_batch_environment_ctx(env_config_path, created_env_info_path):
        assert created_env_info_path.exists()
        created_env_info = yaml.safe_load(created_env_info_path.read_text())
        display(f"{created_env_info=}")

"env_config={'eu-west-1': {'csv_processing': {'compute_environment': {'instance_type': 'r5.16xlarge', 'max_instances': 10, 'min_instances': 0, 'name': 'testing_csv_processing_compute_environment'}, 'job_definition': {'image': 'registry.gitlab.com/airt.ai/airt-service:dev', 'name': 'testing_csv_processing_job_definition'}, 'job_queue': {'name': 'testing_csv_processing_job_queue', 'priority': 100}}, 'predictions': {'compute_environment': {'instance_type': 'g4dn.xlarge', 'max_instances': 10, 'min_instances': 0, 'name': 'testing_predictions_compute_environment'}, 'job_definition': {'image': 'registry.gitlab.com/airt.ai/airt-service:dev', 'name': 'testing_predictions_job_definition'}, 'job_queue': {'name': 'testing_predictions_job_queue', 'priority': 100}}, 'preprocessing': {'compute_environment': {'instance_type': 'r5.16xlarge', 'max_instances': 10, 'min_instances': 0, 'name': 'testing_preprocessing_compute_environment'}, 'job_definition': {'image': 'registry.gitlab.com/airt.ai/airt-servic

task_name='csv_processing'
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment', status=VALID, state=ENABLED
compute_env.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment'
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=CREATING, state=ENABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=VALID, state=ENABLED
job_queue.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue'
job_definition.arn='arn:aws:batch:eu-west-1:617504802562:job-definition/testing_csv_processing_job_definition:2'
task_name='predictio

"created_env_info={'eu-west-1': {'csv_processing': {'compute_environment_arn': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment', 'job_definition_arn': 'arn:aws:batch:eu-west-1:617504802562:job-definition/testing_csv_processing_job_definition:2', 'job_queue_arn': 'arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue'}, 'predictions': {'compute_environment_arn': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_predictions_compute_environment', 'job_definition_arn': 'arn:aws:batch:eu-west-1:617504802562:job-definition/testing_predictions_job_definition:2', 'job_queue_arn': 'arn:aws:batch:eu-west-1:617504802562:job-queue/testing_predictions_job_queue'}, 'preprocessing': {'compute_environment_arn': 'arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_preprocessing_compute_environment', 'job_definition_arn': 'arn:aws:batch:eu-west-1:617504802562:job-definition/testing_preprocessing_j

deleting job definition - csv_processing
deleting job queue - csv_processing
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=VALID, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.ar

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_csv_processing_job_queue' deleted
deleting compute env - csv_processing
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:compute-environment/testing_csv_processing_compute_environment', status=VALID, state=DISABLED
deleting job definition - predictions
deleting job queue - predictions
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_predictions_job_queue', status=UPDATING, sta

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_preprocessing_job_queu

[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:eu-west-1:617504802562:job-queue/testing_training_job_queue', status=DELETING, state=DISABLED

[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:job-queue/testing_training_job_queue' deleted
deleting compute env - training
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/testing_training_compute_environment', status=UPDATING, state=DISABLED
[INFO] __main__: wait(): self.arn='arn:aws:batch:us-west-1:617504802562:compute-environment/testing_training_compute_environment', status=VALID, state=DISABLED
