In [None]:
#| default_exp airflow.base_executor

In [None]:
#| export

from datetime import datetime, timedelta
from pathlib import Path
from typing import *

from airt_service.sanitizer import sanitized_print
from airt.executor.subcommand import (
    ModelExecutor,
    CLICommandBase,
)
from airt.helpers import slugify
from airt.logger import get_logger
from airt.patching import patch

from airt_service.airflow.utils import create_dag

22-10-20 06:45:11.408 [INFO] airt.executor.subcommand: Module loaded.


In [None]:
import tempfile
from time import sleep

from airt.executor.subcommand import ClassCLICommand, SimpleCLICommand
from airt.testing import activate_by_import
from airt_service.airflow.utils import trigger_dag, unpause_dag, wait_for_run_to_complete, list_dag_runs

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
#| exporti

logger = get_logger(__name__)

In [None]:
logger.info("Module loaded.")

[INFO] __main__: Module loaded.


In [None]:
def setup_test_paths(d: str) -> Tuple[str, str]:
    """Create ``data`` and ``model`` sub-directories under ``d`` for tests.

    Args:
        d: directory in which the two sub-directories are created
    Returns:
        A tuple of "local:"-prefixed path strings, data first and model second
    """
    root = Path(d)
    # the f-string below echoes the variable name, so the list stays named `paths`
    paths = []
    for name in ("data", "model"):
        paths.append(root / name)
    display(f"{paths=}")

    urls = []
    for path in paths:
        # create tmp dirs for data and model
        path.mkdir(parents=True, exist_ok=True)
        urls.append("local:" + str(path))

    # RemotePaths: data_path is "read-only", while model_path can be used for both reading and writing between calls
    return tuple(urls)


# Exercise the helper once. The temporary directory is removed when the `with`
# block exits, so the URLs displayed below refer to paths that no longer exist.
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

data_path_url, model_path_url

"paths=[PosixPath('/tmp/tmpvz84nj5l/data'), PosixPath('/tmp/tmpvz84nj5l/model')]"

('local:/tmp/tmpvz84nj5l/data', 'local:/tmp/tmpvz84nj5l/model')

In [None]:
#| export


class BaseAirflowExecutor(ModelExecutor):
    """Base class for executors that turn their steps into an Airflow DAG.

    Every method defined in this class body is a stub that raises
    NotImplementedError; concrete behaviour is expected to be supplied by
    child classes or attached to the class afterwards.
    """

    def _create_step_template(self, step: CLICommandBase, **kwargs):
        """Create the task template for a single step

        Args:
            step: step for which the template is created
            kwargs: keyword arguments for step
        Returns:
            Template for step
        Raises:
            NotImplementedError: always, this stub has no implementation
        """
        raise NotImplementedError("Need to implement")

    def _create_dag_template(
        self,
        on_step_start: Optional[CLICommandBase] = None,
        on_step_end: Optional[CLICommandBase] = None,
        **kwargs,
    ) -> str:
        """
        Create DAG template with steps as tasks

        Args:
            on_step_start: CLI to call before executing step/task in DAG
            on_step_end: CLI to call after executing step/task in DAG
            kwargs: keyword arguments to pass to steps' CLI
        Returns:
            Generated DAG with steps as tasks
        Raises:
            NotImplementedError: always, this stub has no implementation
        """
        raise NotImplementedError("Need to implement")

    def schedule(
        self,
        *,
        schedule_interval: Optional[Union[str, timedelta]] = None,
        description: str,
        tags: Union[str, List[str]],
        on_step_start: Optional[CLICommandBase] = None,
        on_step_end: Optional[CLICommandBase] = None,
        **kwargs,
    ) -> Path:
        """Create scheduled DAG in airflow

        All arguments are keyword-only.

        Args:
            schedule_interval: schedule interval of DAG as string or timedelta object
            description: description of DAG
            tags: tags for DAG, either a single tag or a list of tags
            on_step_start: CLI to call before executing step/task in DAG
            on_step_end: CLI to call after executing step/task in DAG
            kwargs: keyword arguments needed for steps/tasks
        Returns:
            Path in which dag file is stored
        Raises:
            NotImplementedError: always, this stub has no implementation
        """
        raise NotImplementedError("Need to implement")

    def execute(
        self,
        *,
        description: str,
        tags: Union[str, List[str]],
        on_step_start: Optional[CLICommandBase] = None,
        on_step_end: Optional[CLICommandBase] = None,
        **kwargs,
    ) -> Tuple[Path, str]:
        """Create DAG and execute steps in airflow

        All arguments are keyword-only.

        Args:
            description: description of DAG
            tags: tags for DAG, either a single tag or a list of tags
            on_step_start: CLI to call before executing step/task in DAG
            on_step_end: CLI to call after executing step/task in DAG
            kwargs: keyword arguments needed for steps/tasks
        Returns:
            A tuple which contains dag file path and run id
        Raises:
            NotImplementedError: always, this stub has no implementation
        """
        raise NotImplementedError("Need to implement")

In [None]:
#| export

# Source template of a generated Airflow DAG file. It is rendered with
# `str.format`, so literal braces are doubled and the fields are:
# {dag_name}, {schedule_interval}, {description}, {start_date} and {tags}.
# {schedule_interval}, {start_date} and {tags} must be given as Python source
# (e.g. "'@daily'", "None", "datetime.timedelta(days=7)").
#
# `schedule_interval` is passed to DAG() directly: `default_args` is only
# forwarded to operators, so a schedule placed there never reaches the DAG.
dag_template = """import datetime
from textwrap import dedent

# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG

# Operators; we need this to operate!
from airflow.providers.amazon.aws.operators.batch import BatchOperator
import azure.batch.models as batchmodels
from airflow.providers.microsoft.azure.operators.batch import AzureBatchOperator
from airflow.operators.bash import BashOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
with DAG(
    '{dag_name}',
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={{
        'depends_on_past': False,
        'email': ['info@airt.ai'],
        'email_on_failure': False,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': datetime.timedelta(minutes=5),
        # 'queue': 'queue',
        # 'pool': 'backfill',
        # 'priority_weight': 10,
        # 'end_date': datetime.datetime(2016, 1, 1),
        # 'wait_for_downstream': False,
        # 'sla': datetime.timedelta(hours=2),
        # 'execution_timeout': datetime.timedelta(seconds=300),
        # 'on_failure_callback': some_function,
        # 'on_success_callback': some_other_function,
        # 'on_retry_callback': another_function,
        # 'sla_miss_callback': yet_another_function,
        # 'trigger_rule': 'all_success'
    }},
    schedule_interval={schedule_interval},
    description='{description}',
    start_date={start_date},
    catchup=False,
    tags={tags},
    is_paused_upon_creation=False,
) as dag:

    # t1, t2 and t3 are examples of tasks created by instantiating operators
"""

In [None]:
# Render the template with sample values to inspect the generated DAG source.
# NOTE(review): `datetime.today()` is a classmethod, so `utcnow().today()`
# discards the UTC value and yields the current local time instead.
sanitized_print(
    dag_template.format(
        dag_name="random",
        schedule_interval="'@daily'",
        start_date=datetime.utcnow().today().__repr__(),
        description="test description",
        tags=(["test_tag"]).__repr__(),
    )
)

import datetime
from textwrap import dedent

# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG

# Operators; we need this to operate!
from airflow.providers.amazon.aws.operators.batch import BatchOperator
import azure.batch.models as batchmodels
from airflow.providers.microsoft.azure.operators.batch import AzureBatchOperator
from airflow.operators.bash import BashOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
with DAG(
    'random',
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={
        'schedule_interval': '@daily',
        'depends_on_past': False,
        'email': ['info@airt.ai'],
        'email_on_failure': False,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': datetime.timedelta(minutes=5),
        # 'queue': 'queue',
        # 'pool': 'backfill',
        # 'priority_weight': 10,
        # '

In [None]:
#| export


@patch
def _create_dag_id(self: BaseAirflowExecutor, **kwargs) -> str:
    """
    Build the dag id from the CLI form of every step

    Args:
        kwargs: keyword arguments forwarded to each step's ``to_cli``
    Returns:
        Slugified string of all steps' CLI commands joined with "_"
    """
    cli_commands = []
    for step in self.steps:
        cli_commands.append(step.to_cli(**kwargs))

    joined_commands = "_".join(cli_commands)
    return slugify(joined_commands)

In [None]:
# Fixture shared by the test cells below: two steps on the same executor class
# that differ only in the function name ("f", then "g").
steps = [
    ClassCLICommand(
        executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
    ),
    ClassCLICommand(
        executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
    ),
]

In [None]:
# Test: the dag id equals the slugified CLI commands of both steps joined by "_".
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    # build the expected id by hand from the two step CLIs
    expected = slugify(
        steps[0].to_cli(data_path_url=data_path_url, model_path_url=model_path_url)
        + "_"
        + steps[1].to_cli(data_path_url=data_path_url, model_path_url=model_path_url)
    )
    # `abe` is reused by the following test cell
    abe = BaseAirflowExecutor(
        steps=steps,
    )
    actual = abe._create_dag_id(
        data_path_url=data_path_url, model_path_url=model_path_url
    )
    display(actual)
    assert actual == expected

"paths=[PosixPath('/tmp/tmpo5g49vy6/data'), PosixPath('/tmp/tmpo5g49vy6/model')]"

'test-executor-my_test_executor-f-data-path-urllocaltmptmpo5g49vy6data-model-path-urllocaltmptmpo5g49vy6model_test-executor-my_test_executor-g-data-path-urllocaltmptmpo5g49vy6data-model-path-urllocaltmptmpo5g49vy6model'

In [None]:
#| export


@patch
def _create_jinja2_template_kwargs(
    self: BaseAirflowExecutor, **kwargs
) -> Dict[str, Any]:
    """
    Wrap every keyword argument in a jinja2 expression

    Each value becomes an expression that reads the key from ``dag_run.conf``
    when it is present there and otherwise falls back to the repr of the
    value passed in.

    Args:
        kwargs: keyword arguments to convert
    Returns:
        A dict with the same keys and jinja2 template formatted values
    """
    # Braces are written four times on purpose; presumably a later
    # `str.format` pass over the DAG template halves them into the jinja2
    # `{{ ... }}` delimiters - TODO confirm against `create_dag`.
    expression = "{{{{ dag_run.conf['%s'] if '%s' in dag_run.conf else %s }}}}"
    return {
        name: expression % (name, name, default.__repr__())
        for name, default in kwargs.items()
    }

In [None]:
# Test: every kwarg is wrapped in a jinja2 expression with the repr of the
# passed value as fallback. Relies on `abe` created in the previous test cell.
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

    # the path URLs are plain strings, so their repr is the value in single quotes
    expected = {
        "data_path_url": "{{{{ dag_run.conf['data_path_url'] if 'data_path_url' in dag_run.conf else '"
        + data_path_url
        + "' }}}}",
        "model_path_url": "{{{{ dag_run.conf['model_path_url'] if 'model_path_url' in dag_run.conf else '"
        + model_path_url
        + "' }}}}",
    }

    actual = abe._create_jinja2_template_kwargs(
        data_path_url=data_path_url, model_path_url=model_path_url
    )
    display(actual)
    assert actual == expected

"paths=[PosixPath('/tmp/tmp4jo015lo/data'), PosixPath('/tmp/tmp4jo015lo/model')]"

{'data_path_url': "{{{{ dag_run.conf['data_path_url'] if 'data_path_url' in dag_run.conf else 'local:/tmp/tmp4jo015lo/data' }}}}",
 'model_path_url': "{{{{ dag_run.conf['model_path_url'] if 'model_path_url' in dag_run.conf else 'local:/tmp/tmp4jo015lo/model' }}}}"}

In [None]:
# DO NOT ADD EXPORT - This patch cell is used for testing purposes only
# The actual _create_step_template should be implemented by the child class


@patch
def _create_step_template(self: BaseAirflowExecutor, step: CLICommandBase, **kwargs):
    """
    Render a step as the source of a BashOperator task

    Args:
        step: step to create template
        kwargs: keyword arguments for step
    Returns:
        Source string of a BashOperator call that runs the step's CLI command
    """
    # jinja2-wrapped values go into the command, the raw values name the task
    templated_kwargs = self._create_jinja2_template_kwargs(**kwargs)
    bash_command = step.to_cli(**templated_kwargs)
    task_name = slugify(step.to_cli(**kwargs))

    # the command is embedded in a triple-single-quoted string literal
    operator_call = (
        "BashOperator(task_id=\"{task_id}\", bash_command='''{bash_command}''')"
    )
    return operator_call.format(task_id=task_name, bash_command=bash_command)

In [None]:
# Show the BashOperator source generated for the first step (no assertion, visual check only).
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    actual = abe._create_step_template(
        steps[0], data_path_url=data_path_url, model_path_url=model_path_url
    )
    display(actual)

"paths=[PosixPath('/tmp/tmp1ucduhf_/data'), PosixPath('/tmp/tmp1ucduhf_/model')]"

'BashOperator(task_id="test-executor-my_test_executor-f-data-path-urllocaltmptmp1ucduhf_data-model-path-urllocaltmptmp1ucduhf_model", bash_command=\'\'\'test-executor my_test_executor f --data-path-url={{{{ dag_run.conf[\'data_path_url\'] if \'data_path_url\' in dag_run.conf else \'local:/tmp/tmp1ucduhf_/data\' }}}} --model-path-url={{{{ dag_run.conf[\'model_path_url\'] if \'model_path_url\' in dag_run.conf else \'local:/tmp/tmp1ucduhf_/model\' }}}}\'\'\')'

In [None]:
# DO NOT ADD EXPORT - This patch cell is used for testing purposes only
# The actual _create_dag_template should be implemented by the child class


@patch
def _create_dag_template(
    self: BaseAirflowExecutor,
    on_step_start: Optional[CLICommandBase] = None,
    on_step_end: Optional[CLICommandBase] = None,
    **kwargs,
) -> str:
    """
    Append one task per step (plus optional hooks) to the base DAG template

    Tasks are named t1, t2, ... in creation order and chained with ``>>`` so
    that they run one after another.

    Args:
        on_step_start: CLI to call before executing step/task in DAG
        on_step_end: CLI to call after executing step/task in DAG
        kwargs: keyword arguments to pass to steps' CLI
    Returns:
        Generated DAG with steps as tasks
    """
    indent = " " * 4

    # Collect the task expressions in execution order: optional start hook,
    # the step itself, optional end hook. Hooks receive the 1-based step number.
    task_definitions = []
    for step_number, step in enumerate(self.steps, start=1):
        if on_step_start is not None:
            task_definitions.append(
                self._create_step_template(
                    on_step_start, step_count=step_number, **kwargs
                )
            )

        task_definitions.append(self._create_step_template(step, **kwargs))

        if on_step_end is not None:
            task_definitions.append(
                self._create_step_template(
                    on_step_end, step_count=step_number, **kwargs
                )
            )

    # one assignment line per task ...
    template_lines = [
        f"\n{indent}t{task_number} = {definition}"
        for task_number, definition in enumerate(task_definitions, start=1)
    ]

    # ... followed by a single line chaining all tasks
    task_names = [
        f"t{task_number}" for task_number in range(1, len(task_definitions) + 1)
    ]
    template_lines.append(f"\n{indent}" + " >> ".join(task_names))

    return dag_template + "".join(template_lines)

In [None]:
# Print the DAG template generated for both steps with start/end hooks
# (no assertion, visual check only).
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

    kwargs = {"data_path_url": data_path_url, "model_path_url": model_path_url}

    abe = BaseAirflowExecutor(steps=steps)

    # hooks take the step number through the {step_count} placeholder
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")
    sanitized_print(
        abe._create_dag_template(
            on_step_start=on_step_start, on_step_end=on_step_end, **kwargs
        )
    )

"paths=[PosixPath('/tmp/tmpx_vn59vn/data'), PosixPath('/tmp/tmpx_vn59vn/model')]"

import datetime
from textwrap import dedent

# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG

# Operators; we need this to operate!
from airflow.providers.amazon.aws.operators.batch import BatchOperator
import azure.batch.models as batchmodels
from airflow.providers.microsoft.azure.operators.batch import AzureBatchOperator
from airflow.operators.bash import BashOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
with DAG(
    '{dag_name}',
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={{
        'schedule_interval': {schedule_interval},
        'depends_on_past': False,
        'email': ['info@airt.ai'],
        'email_on_failure': False,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': datetime.timedelta(minutes=5),
        # 'queue': 'queue',
        # 'pool': 'backfill',
        # 'priority_weight':

In [None]:
#| export


@patch
def _create_dag(
    self: BaseAirflowExecutor,
    *,
    schedule_interval: Optional[str] = None,
    description: str,
    tags: Union[str, List[str]],
    on_step_start: Optional[CLICommandBase] = None,
    on_step_end: Optional[CLICommandBase] = None,
    **kwargs,
) -> Tuple[str, Path]:
    """Create DAG in airflow

    Renders the DAG template for the executor's steps and hands it to
    ``create_dag`` together with the values for the template fields.

    Args:
        schedule_interval: schedule interval of DAG as string; it is passed on
            to ``create_dag`` unchanged
        description: description of DAG
        tags: tags for DAG, either a single tag or a list of tags
        on_step_start: CLI to call before executing step/task in DAG
        on_step_end: CLI to call after executing step/task in DAG
        kwargs: keyword arguments needed for steps/tasks
    Returns:
        A tuple of dag id and dag file path
    """
    # a single tag is accepted for convenience; the template needs a list
    if isinstance(tags, str):
        tags = [tags]

    curr_dag_template = self._create_dag_template(
        on_step_start=on_step_start, on_step_end=on_step_end, **kwargs
    )
    dag_id = self._create_dag_id(**kwargs)

    # `datetime.today()` is a classmethod returning local time, so chaining it
    # after `utcnow()` would throw the UTC value away; use the UTC value itself.
    start_date = datetime.utcnow()

    dag_file_path = create_dag(
        dag_id=dag_id,
        dag_definition_template=curr_dag_template,
        schedule_interval=schedule_interval,
        # start_date and tags are passed as Python source text
        start_date=repr(start_date),
        description=description,
        tags=repr(tags),
    )

    return dag_id, dag_file_path

In [None]:
# Integration test against a running Airflow instance: create an unscheduled
# DAG, trigger it manually and wait for the run to finish.
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    steps = [
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
        ),
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
        ),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = BaseAirflowExecutor(steps=steps)
    dag_id, dag_file_path = abe._create_dag(
        data_path_url=data_path_url,
        model_path_url=model_path_url,
        schedule_interval=None,
        description="test description",
        tags=["test_tag"],
        on_step_start=on_step_start,
        on_step_end=on_step_end,
    )

    # Remove the generated DAG file even when triggering or waiting fails, so
    # that a failed test does not leave a stray DAG in the dags folder.
    try:
        display(f"{dag_file_path=}")
        # the dag id used for the API calls is taken from the file name
        dag_id = dag_file_path.stem

        # presumably gives Airflow time to pick up the new DAG file - TODO confirm
        sleep(15)

        dag_runs = list_dag_runs(dag_id=dag_id)
        display(f"{dag_runs=}")

        run_id = trigger_dag(dag_id=dag_id, conf={})
        display(run_id)

        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
    finally:
        dag_file_path.unlink()

"paths=[PosixPath('/tmp/tmp3m4x5s7t/data'), PosixPath('/tmp/tmp3m4x5s7t/model')]"

"dag_file_path=PosixPath('/root/airflow/dags/test-executor-my_test_executor-f-data-path-urllocaltmptmp3m4x5s7tdata-model-path-urllocaltmptmp3m4x5s7tmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmp3m4x5s7tdata-model-path-urllocaltmptmp3m4x5s7tmodel.py')"

'dag_runs=[]'

[{'dag_id': 'test-executor-my_test_executor-f-data-path-urllocaltmptmp3m4x5s7tdata-model-path-urllocaltmptmp3m4x5s7tmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmp3m4x5s7tdata-model-path-urllocaltmptmp3m4x5s7tmodel', 'run_id': 'airt-service__2022-10-20T06:45:31.469563', 'state': 'running', 'execution_date': '2022-10-20T06:45:32+00:00', 'start_date': '2022-10-20T06:45:33.033998+00:00', 'end_date': ''}]


'airt-service__2022-10-20T06:45:31.469563'

'success'

In [None]:
#| export


@patch
def schedule(
    self: BaseAirflowExecutor,
    *,
    schedule_interval: Optional[Union[str, timedelta]] = None,
    description: str,
    tags: Union[str, List[str]],
    on_step_start: Optional[CLICommandBase] = None,
    on_step_end: Optional[CLICommandBase] = None,
    **kwargs,
) -> Path:
    """Create scheduled DAG in airflow

    Args:
        schedule_interval: schedule interval of DAG as string or timedelta object
        description: description of DAG
        tags: tags for DAG, either a single tag or a list of tags
        on_step_start: CLI to call before executing step/task in DAG
        on_step_end: CLI to call after executing step/task in DAG
        kwargs: keyword arguments needed for steps/tasks
    Returns:
        Path in which dag file is stored
    """
    # The interval is handed on as Python source text. repr() produces a valid
    # literal for every accepted type (str, timedelta, None) and, unlike
    # wrapping the string in quotes by hand, also escapes quotes and
    # backslashes inside a string value.
    schedule_interval_source = repr(schedule_interval)

    # only the file path is part of this method's contract; the dag id is dropped
    _, dag_file_path = self._create_dag(
        schedule_interval=schedule_interval_source,
        description=description,
        tags=tags,
        on_step_start=on_step_start,
        on_step_end=on_step_end,
        **kwargs,
    )

    return dag_file_path

In [None]:
# Integration test against a running Airflow instance: create a weekly
# scheduled DAG, trigger it manually and wait for the run to finish.
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    steps = [
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
        ),
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
        ),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = BaseAirflowExecutor(steps=steps)
    dag_file_path = abe.schedule(
        data_path_url=data_path_url,
        model_path_url=model_path_url,
        schedule_interval=timedelta(days=7),
        description="test description",
        tags="test_tag",
        on_step_start=on_step_start,
        on_step_end=on_step_end,
    )

    # Remove the generated DAG file even when triggering or waiting fails, so
    # that a failed test does not leave a stray DAG in the dags folder.
    try:
        display(f"{dag_file_path=}")
        # the dag id used for the API calls is taken from the file name
        dag_id = dag_file_path.stem

        # presumably gives Airflow time to pick up the new DAG file - TODO confirm
        sleep(15)

        dag_runs = list_dag_runs(dag_id=dag_id)
        display(f"{dag_runs=}")

        run_id = trigger_dag(dag_id=dag_id, conf={})
        display(run_id)

        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
    finally:
        dag_file_path.unlink()

"paths=[PosixPath('/tmp/tmpw8xwrxp1/data'), PosixPath('/tmp/tmpw8xwrxp1/model')]"

"dag_file_path=PosixPath('/root/airflow/dags/test-executor-my_test_executor-f-data-path-urllocaltmptmpw8xwrxp1data-model-path-urllocaltmptmpw8xwrxp1model_test-executor-my_test_executor-g-data-path-urllocaltmptmpw8xwrxp1data-model-path-urllocaltmptmpw8xwrxp1model.py')"

'dag_runs=[]'

[{'dag_id': 'test-executor-my_test_executor-f-data-path-urllocaltmptmpw8xwrxp1data-model-path-urllocaltmptmpw8xwrxp1model_test-executor-my_test_executor-g-data-path-urllocaltmptmpw8xwrxp1data-model-path-urllocaltmptmpw8xwrxp1model', 'run_id': 'airt-service__2022-10-20T06:46:06.626565', 'state': 'running', 'execution_date': '2022-10-20T06:46:07+00:00', 'start_date': '2022-10-20T06:46:08.520235+00:00', 'end_date': ''}]


'airt-service__2022-10-20T06:46:06.626565'

'success'