Update sample dag and doc for Datasync (#23511)
vincbeck committed May 9, 2022
1 parent 5d1e6ff commit ec4dcce
Showing 4 changed files with 82 additions and 271 deletions.
@@ -14,47 +14,21 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This is an example dag for using `AWSDataSyncOperator` in a more complex manner.
- Try to get a TaskArn. If one exists, update it.
- If no tasks exist, try to create a new DataSync Task.
- If source and destination locations don't exist for the new task, create them first
- If many tasks exist, raise an Exception
- After getting or creating a DataSync Task, run it
This DAG relies on the following environment variables:
* SOURCE_LOCATION_URI - Source location URI, usually on premises SMB or NFS
* DESTINATION_LOCATION_URI - Destination location URI, usually S3
* CREATE_TASK_KWARGS - Passed to boto3.create_task(**kwargs)
* CREATE_SOURCE_LOCATION_KWARGS - Passed to boto3.create_location(**kwargs)
* CREATE_DESTINATION_LOCATION_KWARGS - Passed to boto3.create_location(**kwargs)
* UPDATE_TASK_KWARGS - Passed to boto3.update_task(**kwargs)
"""

import json
import re
from datetime import datetime
from os import getenv

from airflow import models
from airflow.models.baseoperator import chain
from airflow.providers.amazon.aws.operators.datasync import DataSyncOperator

# [START howto_operator_datasync_2_args]
TASK_ARN = getenv("TASK_ARN", "my_aws_datasync_task_arn")
SOURCE_LOCATION_URI = getenv("SOURCE_LOCATION_URI", "smb://hostname/directory/")

DESTINATION_LOCATION_URI = getenv("DESTINATION_LOCATION_URI", "s3://mybucket/prefix")

default_create_task_kwargs = '{"Name": "Created by Airflow"}'
CREATE_TASK_KWARGS = json.loads(getenv("CREATE_TASK_KWARGS", default_create_task_kwargs))

default_create_source_location_kwargs = "{}"
CREATE_SOURCE_LOCATION_KWARGS = json.loads(
    getenv("CREATE_SOURCE_LOCATION_KWARGS", default_create_source_location_kwargs)
)

bucket_access_role_arn = "arn:aws:iam::11112223344:role/r-11112223344-my-bucket-access-role"
CREATE_TASK_KWARGS = json.loads(getenv("CREATE_TASK_KWARGS", '{"Name": "Created by Airflow"}'))
CREATE_SOURCE_LOCATION_KWARGS = json.loads(getenv("CREATE_SOURCE_LOCATION_KWARGS", '{}'))
default_destination_location_kwargs = """\
{"S3BucketArn": "arn:aws:s3:::mybucket",
"S3Config": {"BucketAccessRoleArn":
@@ -63,23 +37,33 @@
CREATE_DESTINATION_LOCATION_KWARGS = json.loads(
    getenv("CREATE_DESTINATION_LOCATION_KWARGS", re.sub(r"[\s+]", '', default_destination_location_kwargs))
)

default_update_task_kwargs = '{"Name": "Updated by Airflow"}'
UPDATE_TASK_KWARGS = json.loads(getenv("UPDATE_TASK_KWARGS", default_update_task_kwargs))

# [END howto_operator_datasync_2_args]
UPDATE_TASK_KWARGS = json.loads(getenv("UPDATE_TASK_KWARGS", '{"Name": "Updated by Airflow"}'))

with models.DAG(
    "example_datasync_2",
    "example_datasync",
    schedule_interval=None,  # Override to match your needs
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=['example'],
) as dag:
    # [START howto_operator_datasync_specific_task]
    # Execute a specific task
    datasync_specific_task = DataSyncOperator(task_id="datasync_specific_task", task_arn=TASK_ARN)
    # [END howto_operator_datasync_specific_task]

    # [START howto_operator_datasync_search_task]
    # Search and execute a task
    datasync_search_task = DataSyncOperator(
        task_id="datasync_search_task",
        source_location_uri=SOURCE_LOCATION_URI,
        destination_location_uri=DESTINATION_LOCATION_URI,
    )
    # [END howto_operator_datasync_search_task]
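    # Note (per the module docstring removed above): when searching by the two
    # location URIs, the operator executes the single matching task; if more
    # than one task matches, it raises an exception rather than guessing.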

    # [START howto_operator_datasync_2]
    datasync_task = DataSyncOperator(
        task_id="datasync_task",
    # [START howto_operator_datasync_create_task]
    # Create a task (the task does not exist)
    datasync_create_task = DataSyncOperator(
        task_id="datasync_create_task",
        source_location_uri=SOURCE_LOCATION_URI,
        destination_location_uri=DESTINATION_LOCATION_URI,
        create_task_kwargs=CREATE_TASK_KWARGS,
@@ -88,4 +72,10 @@
        update_task_kwargs=UPDATE_TASK_KWARGS,
        delete_task_after_execution=True,
    )
    # [END howto_operator_datasync_2]
    # [END howto_operator_datasync_create_task]

    chain(
        datasync_specific_task,
        datasync_search_task,
        datasync_create_task,
    )
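A side note on `chain`, grounded in Airflow's API rather than in this commit: `chain(...)` wires the listed tasks into a linear dependency, so the call above is equivalent to the bitshift form:

    # Equivalent dependency declaration using bitshift operators:
    datasync_specific_task >> datasync_search_task >> datasync_create_task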
69 changes: 0 additions & 69 deletions airflow/providers/amazon/aws/example_dags/example_datasync_1.py

This file was deleted.

2 changes: 1 addition & 1 deletion airflow/providers/amazon/aws/operators/datasync.py
@@ -31,7 +31,7 @@


class DataSyncOperator(BaseOperator):
r"""Find, Create, Update, Execute and Delete AWS DataSync Tasks.
"""Find, Create, Update, Execute and Delete AWS DataSync Tasks.
If ``do_xcom_push`` is True, then the DataSync TaskArn and TaskExecutionArn
which were executed will be pushed to an XCom.
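To make that XCom note concrete, here is a minimal, hypothetical sketch of a downstream task reading the pushed result; it is not part of this commit, the task_id matches the example DAG above, and the exact payload shape should be checked against the operator's implementation:

    from airflow.operators.python import PythonOperator

    def _print_datasync_result(ti):
        # Per the docstring above, DataSyncOperator pushes the executed TaskArn
        # and TaskExecutionArn to XCom when do_xcom_push is True; depending on
        # the operator version they may live under the default "return_value" key.
        print(ti.xcom_pull(task_ids="datasync_create_task"))

    print_result = PythonOperator(
        task_id="print_datasync_result",
        python_callable=_print_datasync_result,
    )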
