In [None]:
#| default_exp airflow.bash_executor

In [None]:
#| export


from pathlib import Path
from typing import *

from airt.executor.subcommand import CLICommandBase
from airt.helpers import slugify
from airt.logger import get_logger
from airt.patching import patch

from airt_service.airflow.base_executor import BaseAirflowExecutor, dag_template
from airt_service.airflow.utils import trigger_dag

22-09-13 11:19:34.246 [INFO] airt.executor.subcommand: Module loaded.


In [None]:
import os
import tempfile
from datetime import timedelta
from time import sleep

from sqlmodel import select

from airt.executor.subcommand import SimpleCLICommand, ClassCLICommand
from airt.testing import activate_by_import
from airt_service.airflow.utils import wait_for_run_to_complete, list_dag_runs
from airt_service.data.utils import create_db_uri_for_s3_datablob
from airt_service.db.models import (
    User,
    create_user_for_testing,
    get_session,
    get_session_with_context,
    DataBlob,
    DataSource,
)
from airt_service.helpers import commit_or_rollback

[INFO] airt.testing.activate_by_import: Testing environment activated.
[INFO] airt.keras.helpers: Using a single GPU #0 with memory_limit 1024 MB


In [None]:
# Create a test user on the "small" subscription; a later cell looks the user up by this username.
test_username = create_user_for_testing(subscription_type="small")
display(test_username)

'jnmqrtsece'

In [None]:
#| exporti

# Module-level logger, named after the module via __name__
logger = get_logger(__name__)

In [None]:
# Check in the notebook that the logger emits at INFO level
logger.info("Module loaded.")

[INFO] __main__: Module loaded.


In [None]:
def setup_test_paths(d: Union[str, Path]) -> Tuple[str, str]:
    """Create the `data` and `model` subdirectories used by the tests

    Args:
        d: base directory, given either as a string or as a `Path`
    Returns:
        A tuple `(data_path_url, model_path_url)`; each item is the created
        directory prefixed with "local:"
    """
    # Keep the argument untouched instead of rebinding it to a different type
    base_dir = Path(d)
    paths = [base_dir / sd for sd in ["data", "model"]]
    print(f"{paths=}")

    # create tmp dirs for data and model
    for p in paths:
        p.mkdir(parents=True, exist_ok=True)

    # RemotePaths: data_path is "read-only", while model_path can be used for both reading and writing between calls
    data_dir, model_dir = paths
    return f"local:{data_dir}", f"local:{model_dir}"


# Demo of setup_test_paths. The temporary directory is removed as soon as the
# `with` block exits, so the URLs displayed below are for illustration only.
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

data_path_url, model_path_url

paths=[Path('/tmp/tmpkuje5ha9/data'), Path('/tmp/tmpkuje5ha9/model')]


('local:/tmp/tmpkuje5ha9/data', 'local:/tmp/tmpkuje5ha9/model')

In [None]:
#| export


class AirflowBashExecutor(BaseAirflowExecutor):
    """Airflow executor whose steps are rendered as `BashOperator` tasks

    Only the signature of `execute` is declared in the class body; the working
    implementation is attached to the class later in this notebook with `@patch`.
    """

    def execute(
        self,
        *,
        description: str,
        tags: Union[str, List[str]],
        on_step_start: Optional[CLICommandBase] = None,
        on_step_end: Optional[CLICommandBase] = None,
        **kwargs
    ) -> Tuple[Path, str]:
        """Create DAG and execute steps in airflow

        Args:
            description: description of DAG
            tags: tags for DAG
            on_step_start: CLI to call before executing step/task in DAG
            on_step_end: CLI to call after executing step/task in DAG
            kwargs: keyword arguments needed for steps/tasks
        Returns:
            A tuple which contains dag file path and run id
        """
        # Placeholder body; superseded by the patched `execute` defined later in the notebook
        raise NotImplementedError("Need to implement")

In [None]:
#| export


@patch
def _create_step_template(self: AirflowBashExecutor, step: CLICommandBase, **kwargs):
    """
    Render a single step as the source code of a `BashOperator` call

    Args:
        step: step to render
        kwargs: keyword arguments passed to the step's CLI
    Returns:
        Source text of the `BashOperator(...)` call for the step
    """
    # The bash command is embedded in the generated code between triple single quotes
    quote3 = "'" * 3

    # The command itself uses the jinja2-templated values ...
    jinja_kwargs = self._create_jinja2_template_kwargs(**kwargs)
    bash_command = step.to_cli(**jinja_kwargs)

    # ... while the task id is derived from the command rendered with the plain values
    task_id = slugify(step.to_cli(**kwargs))

    return f'BashOperator(task_id="{task_id}", bash_command={quote3}{bash_command}{quote3})'

In [None]:
# Two steps of the same test executor class, differing only in the function they call
steps = [
    ClassCLICommand(
        executor_name="test-executor", class_name="MyTestExecutor", f_name=f_name
    )
    for f_name in ("f", "g")
]

In [None]:
# Test case for AirflowBashExecutor._create_step_template
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    abe = AirflowBashExecutor(
        steps=steps,
    )
    actual = abe._create_step_template(
        steps[0], data_path_url=data_path_url, model_path_url=model_path_url
    )
    display(actual)

    # Fail on regressions instead of relying on visual inspection of the output:
    # the step must be rendered as a BashOperator call ...
    assert actual.startswith('BashOperator(task_id="'), actual
    # ... whose command carries both URLs as fallback values
    assert data_path_url in actual, actual
    assert model_path_url in actual, actual

paths=[Path('/tmp/tmp6vfu5r5b/data'), Path('/tmp/tmp6vfu5r5b/model')]


'BashOperator(task_id="test-executor-my_test_executor-f-data-path-urllocaltmptmp6vfu5r5bdata-model-path-urllocaltmptmp6vfu5r5bmodel", bash_command=\'\'\'test-executor my_test_executor f --data-path-url={{{{ dag_run.conf[\'data_path_url\'] if \'data_path_url\' in dag_run.conf else \'local:/tmp/tmp6vfu5r5b/data\' }}}} --model-path-url={{{{ dag_run.conf[\'model_path_url\'] if \'model_path_url\' in dag_run.conf else \'local:/tmp/tmp6vfu5r5b/model\' }}}}\'\'\')'

In [None]:
#| export


@patch
def _create_dag_template(
    self: BaseAirflowExecutor,
    on_step_start: Optional[CLICommandBase] = None,
    on_step_end: Optional[CLICommandBase] = None,
    **kwargs,
) -> str:
    """
    Append one task per step (plus the optional hooks) to the base DAG template

    Args:
        on_step_start: CLI to call before executing step/task in DAG
        on_step_end: CLI to call after executing step/task in DAG
        kwargs: keyword arguments to pass to steps' CLI
    Returns:
        The DAG template followed by the task assignments `t1`, `t2`, ... and a
        final line chaining them in order with `>>`
    """
    # Every generated line starts on a new line, indented by four spaces
    line_prefix = "\n" + " " * 4

    # Render the tasks in execution order: [start hook,] step [, end hook] for each step
    rendered_tasks: List[str] = []
    for step_number, step in enumerate(self.steps, start=1):
        if on_step_start is not None:
            rendered_tasks.append(
                self._create_step_template(
                    on_step_start, step_count=step_number, **kwargs
                )
            )

        rendered_tasks.append(self._create_step_template(step, **kwargs))

        if on_step_end is not None:
            rendered_tasks.append(
                self._create_step_template(
                    on_step_end, step_count=step_number, **kwargs
                )
            )

    # Tasks are numbered from 1 in the order they were rendered
    task_names = [f"t{number}" for number in range(1, len(rendered_tasks) + 1)]
    assignments = "".join(
        f"{line_prefix}{name} = {task}"
        for name, task in zip(task_names, rendered_tasks)
    )
    dependency_chain = line_prefix + " >> ".join(task_names)

    return dag_template + assignments + dependency_chain

In [None]:
# Test case for AirflowBashExecutor._create_dag_template
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

    kwargs = {"data_path_url": data_path_url, "model_path_url": model_path_url}

    abe = AirflowBashExecutor(steps=steps)

    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")
    actual = abe._create_dag_template(
        on_step_start=on_step_start, on_step_end=on_step_end, **kwargs
    )
    print(actual)

    # With both hooks set, every step contributes three tasks (start hook, step, end hook),
    # and the template must end with all of them chained in order
    expected_task_count = 3 * len(steps)
    expected_chain = " >> ".join(f"t{i}" for i in range(1, expected_task_count + 1))
    assert actual.endswith("\n" + " " * 4 + expected_chain), actual[-200:]

paths=[Path('/tmp/tmpvv7l9r_3/data'), Path('/tmp/tmpvv7l9r_3/model')]
import datetime
from textwrap import dedent

# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG

# Operators; we need this to operate!
from airflow.providers.amazon.aws.operators.batch import BatchOperator
from airflow.operators.bash import BashOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
with DAG(
    '{dag_name}',
    # These args will get passed on to each operator
    # You can override them on a per-task basis during operator initialization
    default_args={{
        'schedule_interval': {schedule_interval},
        'depends_on_past': False,
        'email': ['info@airt.ai'],
        'email_on_failure': False,
        'email_on_retry': False,
        'retries': 1,
        'retry_delay': datetime.timedelta(minutes=5),
        # 'queue': 'queue',
        # 'pool': 'backfill',
        # 'priority_weight': 10,
        # 'end_date': datetime.datetime(2016, 1

In [None]:
# Test case for AirflowBashExecutor._create_dag

with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    steps = [
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
        ),
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
        ),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = AirflowBashExecutor(steps=steps)
    dag_id, dag_file_path = abe._create_dag(
        data_path_url=data_path_url,
        model_path_url=model_path_url,
        schedule_interval=None,
        description="test description",
        tags=["test_tag"],
        on_step_start=on_step_start,
        on_step_end=on_step_end,
    )

    # Remove the generated DAG file even when a later call raises or an assertion
    # fails, so a broken run does not leave a stale DAG behind
    try:
        display(f"{dag_file_path=}")
        # The DAG id is the name of the DAG file without its ".py" suffix
        dag_id = dag_file_path.stem

        # Presumably gives Airflow time to pick up the new DAG file - TODO confirm the required delay
        sleep(15)

        dag_runs = list_dag_runs(dag_id=dag_id)
        display(f"{dag_runs=}")

        run_id = trigger_dag(dag_id=dag_id, conf={})

        display(run_id)
        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
        assert state == "success", state
    finally:
        dag_file_path.unlink()

paths=[Path('/tmp/tmp4n319w1w/data'), Path('/tmp/tmp4n319w1w/model')]


"dag_file_path=Path('/root/airflow/dags/test-executor-my_test_executor-f-data-path-urllocaltmptmp4n319w1wdata-model-path-urllocaltmptmp4n319w1wmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmp4n319w1wdata-model-path-urllocaltmptmp4n319w1wmodel.py')"

'dag_runs=[]'

[{'dag_id': 'test-executor-my_test_executor-f-data-path-urllocaltmptmp4n319w1wdata-model-path-urllocaltmptmp4n319w1wmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmp4n319w1wdata-model-path-urllocaltmptmp4n319w1wmodel', 'run_id': 'airt-service__2022-09-13T11:19:55.443395', 'state': 'running', 'execution_date': '2022-09-13T11:19:56+00:00', 'start_date': '2022-09-13T11:19:56.577585+00:00', 'end_date': ''}]


'airt-service__2022-09-13T11:19:55.443395'

'success'

In [None]:
# Test case for AirflowBashExecutor.schedule

with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)
    steps = [
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
        ),
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
        ),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = AirflowBashExecutor(steps=steps)
    dag_file_path = abe.schedule(
        data_path_url=data_path_url,
        model_path_url=model_path_url,
        schedule_interval=timedelta(days=7),
        description="test description",
        tags="test_tag",
        on_step_start=on_step_start,
        on_step_end=on_step_end,
    )

    # Remove the generated DAG file even when a later call raises or an assertion
    # fails, so a broken run does not leave a scheduled DAG behind
    try:
        display(f"{dag_file_path=}")
        # The DAG id is the name of the DAG file without its ".py" suffix
        dag_id = dag_file_path.stem

        # Presumably gives Airflow time to pick up the new DAG file - TODO confirm the required delay
        sleep(15)

        dag_runs = list_dag_runs(dag_id=dag_id)
        display(f"{dag_runs=}")

        # Trigger a run by hand rather than waiting for the weekly schedule
        run_id = trigger_dag(dag_id=dag_id, conf={})

        display(run_id)
        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
        assert state == "success", state
    finally:
        dag_file_path.unlink()

paths=[Path('/tmp/tmp30zowdz5/data'), Path('/tmp/tmp30zowdz5/model')]


"dag_file_path=Path('/root/airflow/dags/test-executor-my_test_executor-f-data-path-urllocaltmptmp30zowdz5data-model-path-urllocaltmptmp30zowdz5model_test-executor-my_test_executor-g-data-path-urllocaltmptmp30zowdz5data-model-path-urllocaltmptmp30zowdz5model.py')"

'dag_runs=[]'

[{'dag_id': 'test-executor-my_test_executor-f-data-path-urllocaltmptmp30zowdz5data-model-path-urllocaltmptmp30zowdz5model_test-executor-my_test_executor-g-data-path-urllocaltmptmp30zowdz5data-model-path-urllocaltmptmp30zowdz5model', 'run_id': 'airt-service__2022-09-13T11:20:31.164363', 'state': 'running', 'execution_date': '2022-09-13T11:20:32+00:00', 'start_date': '2022-09-13T11:20:32.171002+00:00', 'end_date': ''}]


'airt-service__2022-09-13T11:20:31.164363'

'success'

In [None]:
#| export


@patch
def execute(
    self: AirflowBashExecutor,
    *,
    description: str,
    tags: Union[str, List[str]],
    on_step_start: Optional[CLICommandBase] = None,
    on_step_end: Optional[CLICommandBase] = None,
    **kwargs
) -> Tuple[Path, str]:
    """Generate the DAG for the configured steps and trigger a run of it

    Args:
        description: description of DAG
        tags: tags for DAG
        on_step_start: CLI to call before executing step/task in DAG
        on_step_end: CLI to call after executing step/task in DAG
        kwargs: keyword arguments needed for steps/tasks; they are also sent as
            the configuration of the triggered run
    Returns:
        A tuple which contains dag file path and run id
    """
    # The DAG is created without a schedule interval and triggered explicitly below
    created_dag_id, created_dag_path = self._create_dag(
        schedule_interval=None,
        description=description,
        tags=tags,
        on_step_start=on_step_start,
        on_step_end=on_step_end,
        **kwargs
    )

    # Pass a copy of the step keyword arguments as the run configuration
    triggered_run_id = trigger_dag(dag_id=created_dag_id, conf=dict(kwargs))
    return created_dag_path, triggered_run_id

In [None]:
# Test case for AirflowBashExecutor.execute
with tempfile.TemporaryDirectory() as d:
    data_path_url, model_path_url = setup_test_paths(d)

    steps = [
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="f"
        ),
        ClassCLICommand(
            executor_name="test-executor", class_name="MyTestExecutor", f_name="g"
        ),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = AirflowBashExecutor(
        steps=steps,
    )

    dag_file_path, run_id = abe.execute(
        description="test description",
        tags="test_tag",
        on_step_start=on_step_start,
        on_step_end=on_step_end,
        data_path_url=data_path_url,
        model_path_url=model_path_url,
    )

    # Remove the generated DAG file even when waiting raises or the assertion fails
    try:
        display(dag_file_path)
        display(run_id)

        # The DAG id is the name of the DAG file without its ".py" suffix
        dag_id = dag_file_path.stem
        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
        assert state == "success", state
    finally:
        dag_file_path.unlink()

paths=[Path('/tmp/tmpyzt4fehf/data'), Path('/tmp/tmpyzt4fehf/model')]
[{'dag_id': 'test-executor-my_test_executor-f-data-path-urllocaltmptmpyzt4fehfdata-model-path-urllocaltmptmpyzt4fehfmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmpyzt4fehfdata-model-path-urllocaltmptmpyzt4fehfmodel', 'run_id': 'airt-service__2022-09-13T11:20:51.257666', 'state': 'running', 'execution_date': '2022-09-13T11:20:52+00:00', 'start_date': '2022-09-13T11:20:52.888510+00:00', 'end_date': ''}]


Path('/root/airflow/dags/test-executor-my_test_executor-f-data-path-urllocaltmptmpyzt4fehfdata-model-path-urllocaltmptmpyzt4fehfmodel_test-executor-my_test_executor-g-data-path-urllocaltmptmpyzt4fehfdata-model-path-urllocaltmptmpyzt4fehfmodel.py')

'airt-service__2022-09-13T11:20:51.257666'

'success'

In [None]:
# Test case for AirflowBashExecutor.execute with a step that pulls an S3 datablob
with get_session_with_context() as session:
    user = session.exec(select(User).where(User.username == test_username)).one()
    uri = "s3://test-airt-service/account_312571_events"
    datablob = DataBlob(
        type="s3",
        uri=create_db_uri_for_s3_datablob(
            uri=uri,
            access_key=os.environ["AWS_ACCESS_KEY_ID"],
            secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        ),
        source=uri,
        cloud_provider="aws",
        region="eu-west-1",
        total_steps=1,
        user=user,
    )
    with commit_or_rollback(session):
        session.add(datablob)
    # Do not display the whole DataBlob: its `uri` field embeds the AWS access key
    # and secret, which would be saved in the notebook output
    display({"id": datablob.id, "type": datablob.type, "source": datablob.source})
    datablob_command = "s3_pull {datablob_id}"
    display(datablob_command)

    steps = [
        SimpleCLICommand(command=datablob_command),
    ]
    on_step_start = SimpleCLICommand(command="sleep {step_count}")
    on_step_end = SimpleCLICommand(command="echo step {step_count} completed")

    abe = AirflowBashExecutor(
        steps=steps,
    )

    dag_file_path, run_id = abe.execute(
        description="test description",
        tags=["test_tag"],
        on_step_start=on_step_start,
        on_step_end=on_step_end,
        datablob_id=datablob.id,
    )

    # Remove the generated DAG file even when waiting raises or the assertion fails
    try:
        display(dag_file_path)
        display(run_id)

        # The DAG id is the name of the DAG file without its ".py" suffix
        dag_id = dag_file_path.stem
        state = wait_for_run_to_complete(dag_id=dag_id, run_id=run_id, timeout=600)
        display(state)
        assert state == "success", state
    finally:
        dag_file_path.unlink()

DataBlob(id=179, uuid=UUID('c611001b-c8b4-49bf-9c52-a647a3de8bca'), type='s3', uri='s3://<REDACTED_ACCESS_KEY>:<REDACTED_SECRET_KEY>@test-airt-service/account_312571_events', source='s3://test-airt-service/account_312571_events', total_steps=1, completed_steps=0, folder_size=None, cloud_provider=<CloudProvider.aws: 'aws'>, region='eu-west-1', error=None, disabled=False, path=None, created=datetime.datetime(2022, 9, 13, 11, 21, 36), user_id=137, pulled_on=None, tags=[])

's3_pull {datablob_id}'

[{'dag_id': 's3_pull-179', 'run_id': 'airt-service__2022-09-13T11:21:51.736479', 'state': 'running', 'execution_date': '2022-09-13T11:21:52+00:00', 'start_date': '2022-09-13T11:21:53.034039+00:00', 'end_date': ''}]


Path('/root/airflow/dags/s3_pull-179.py')

'airt-service__2022-09-13T11:21:51.736479'

'success'

In [None]:
# The user sends a request. The following lines are run by airt-service before it returns the response to the user

# abd_factory  = AirflowBashExecutor(executor_cli=,) # Does nothing except setting instance variables
# abd_factory.schedule(data_path_url, model_path_url, period="7 days") # Generates and save dag, unpauses it, will run periodically


# abd_factory.execute(data_path_url, model_path_url) # Generates and saves dag and runs it immediately?