In [None]:
# Minimal DAG definition: only an id and a start date are supplied.
etl_dag = DAG(
    dag_id='etl_pipeline',
    default_args={'start_date': '2020-01-08'},
)

In [None]:
# airflow run command (shell)
airflow run <dag_id> <task_id> <start_date> 

In [None]:
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

# Settings handed to the DAG as its default_args: owner, contact e-mail,
# start date and retry count.
default_arguments = dict(
    owner='Thulasiram',
    email='tulasiram.gunipati@gmail.com',
    start_date=datetime(2020, 1, 20),
    retries=2,
)

etl_dag = DAG(dag_id='etl_workflow', default_args=default_arguments)

# airflow -h for descriptions
# airflow list_dags to show all recognized DAGs

# Task that echoes a random number, attached to the `etl_dag` defined in this cell.
part1 = BashOperator(
    task_id='generate_random_number',  # arguments must be comma-separated
    bash_command='echo $RANDOM',
    dag=etl_dag,  # this cell names its DAG `etl_dag`, not `dag`
)

In [None]:
# Airflow command to start the server
airflow webserver -p port_number

In [None]:
from airflow.models import DAG
# BashOperator lives in the `bash_operator` module.
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.http_operator import SimpleHttpOperator

# DAG shared by the tasks defined in this cell.
dag = DAG(
    dag_id='update_start',
    default_args={"start_date": "2019-10-01"},
)

# Bash task: echo a random number.
part1 = BashOperator(
    dag=dag,
    task_id='generate_random_number',
    bash_command='echo $RANDOM',
)

import sys
def python_version():
    """Return the running interpreter's version string (``sys.version``)."""
    version_string = sys.version
    return version_string

# Python task: runs python_version() when the task executes.
part2 = PythonOperator(
    task_id = 'get_python_version',
    python_callable = python_version,
    dag = dag
)

# HTTP task: issues a GET request to the configured endpoint.
part3 = SimpleHttpOperator(
    task_id = 'query_server_for_external_ip',
    endpoint = 'https://api.ipify.org',
    method = 'GET',
    dag = dag
)

# part3 runs before part2. The identifier must be written without a space.
part3 >> part2

#### Operators

In [None]:
+ Operators are available in the airflow.operators / airflow.contrib.operators libraries
+ Represents a single task in a workflow
+ Run independently (usually)
+ Generally do not share information
+ Various operators to perform different tasks
+ Using BashOperator we can move from running individual bash scripts to Airflow

#### Tasks

+ Instances of operators
+ Task Dependencies
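+ Referred to as upstream or downstream tasks
+ Defined using bitshift operators
+ >> upstream operator (the task on the left runs before the task on the right)
+ << downstream operator (the task on the left runs after the task on the right)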

In [None]:
# Task that downloads the latest sales information with wget.
pull_sales = BashOperator(
    dag=analytics_dag,
    task_id='pullsales_task',
    bash_command='wget https://salestracking/latestinfo?json',
)

# Execution order: pull_sales first, then cleanup, then consolidate,
# and push_data last.
pull_sales >> cleanup >> consolidate >> push_data

In [None]:
from airflow.operators.python_operator import PythonOperator
def sleep(length_of_time):
    """Block for ``length_of_time`` seconds.

    Args:
        length_of_time: Number of seconds to pause (passed to ``time.sleep``).
    """
    # Imported here because nothing in this cell brings `time` into scope.
    import time

    time.sleep(length_of_time)
    
# Run sleep() as a task, supplying a five-second duration by keyword.
sleep_task = PythonOperator(
    dag=example_dag,
    task_id='sleep',
    python_callable=sleep,
    op_kwargs=dict(length_of_time=5),
)


In [None]:
# Airflow contain emailOperator as well
from airflow.operators.email_operator import EmailOperator

# E-mail the sales report as an attachment.
email_task = EmailOperator(
    task_id = 'email_sales_report',
    to = 'sales_manager@example.com',
    subject = "Automated report",
    html_content = 'Attached is the latest report',
    # `files` takes a list of file names; a bare string would be iterated
    # character by character when the attachments are collected.
    files = ['latest_sales.xlsx'],
    dag = example_dag
)

In [None]:
# Code to download data from url

def pull_file(URL, savepath):
    """Download ``URL`` and write the response body to ``savepath``.

    Args:
        URL: Address fetched with an HTTP GET.
        savepath: Path of the file that receives the raw response bytes
            (opened in ``'wb'`` mode, so existing content is replaced).

    Raises:
        requests.HTTPError: If the server answers with an error status; the
            destination file is not touched in that case.

    NOTE(review): relies on ``requests`` already being imported in the
    kernel; no import for it is visible in this notebook.
    """
    r = requests.get(URL)
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
    r.raise_for_status()
    with open(savepath, 'wb') as f:
        f.write(r.content)
    # Use the print method for logging
    print(f"File pulled from {URL} and saved to {savepath}")

from airflow.operators.python_operator import PythonOperator

# Task that calls pull_file() with the source URL and the destination path
# passed as keyword arguments.
pull_file_task = PythonOperator(
    dag=process_sales_dag,
    task_id='pull_file',
    python_callable=pull_file,
    op_kwargs=dict(URL='http://dataserver/sales.json', savepath='latestsales.json'),
)

#### Scheduling 

In [None]:
start_date - Date / time to initially schedule the DAG run
end_date - Optional attribute for when to stop running new DAG instances
max_tries - How many attempts to make
schedule_interval - How often to run - cron syntax or built in presets

#### cron syntax
minute - (0 - 59)
hour - (0 - 23)
day of the month - (1 - 31)
month - (1 - 12)
day of the week - (0 - 6)(sunday to saturday)

# preset    cron equivalent
@hourly     0 * * * *
@daily      0 0 * * *
@weekly     0 0 * * 0
None - used for manual triggered DAGs
@once - schedule only once

In [None]:
# Task defaults: owner, start date, alert address, e-mail switches and
# retry policy (3 retries, 20 minutes apart).
default_args = dict(
    owner='Engineering',
    start_date=datetime(2019, 11, 1),
    email=['airflowresults@datacamp.com'],
    email_on_failure=False,
    email_on_retry=False,
    retries=3,
    retry_delay=timedelta(minutes=20),
)

# Cron fields: minute 30, hour 12, any day of month, any month,
# weekday 3 (Wednesday).
dag = DAG(
    dag_id='update_dataflows',
    default_args=default_args,
    schedule_interval='30 12 * * 3',
)

#### sensor

+ An operator that waits for a certain condition to be true
+ Can be defined how often to check for the condition to be true
+ They are a type of operators and can be assigned to tasks

In [None]:
Sensors derive from airflow.sensors.base_sensor_operator
mode = 'poke' - Run repeatedly
mode = 'reschedule' - Give up a task slot and try again later
timeout - How long to wait, in seconds
Sensors also accept the normal operator arguments

In [None]:
# File sensor
from airflow.contrib.sensors.file_sensor import FileSensor
# Sensor watching for salesdata.csv, with a poke interval of 300.
file_sensor_task = FileSensor(
    dag=sales_report_dag,
    task_id='file_sense',
    filepath='salesdata.csv',
    poke_interval=300,
)

# The sensor sits between the cleanup step and report generation.
init_sales_cleanup >> file_sensor_task
file_sensor_task >> generate_report

# ExternalTaskSensor - wait for a task in another DAG to complete
# HttpSensor - Request a web URL and check for content
# SqlSensor - Runs a SQL query to check for content

##### When to use a Sensor
+ Uncertain when it will be true
+ If failure not immediately desired
+ To add task repetition without loops

#### Executors

In [None]:
+ Executors run tasks
+ Example executors - SequentialExecutor, LocalExecutor, CeleryExecutor
+ We can know the type of executor by looking at the airflow.cfg file
+ cat airflow/airflow.cfg | grep "executor ="
+ we can also know this from airflow list_dags

#### SLA's

In [None]:
# Use the 'sla' argument on an individual task ...
task1 = BashOperator(task_id = 'sla_task',
                     bash_command = 'runcode.sh',
                     sla = timedelta(seconds = 30),
                     dag = dag)

# ... or place 'sla' in default_args so it is passed to the DAG's tasks.
default_args = {
    'sla': timedelta(minutes = 20),
    'start_date':datetime(2020,2,20)
}

dag = DAG('sla_dag', default_args = default_args)

In [None]:
# Import the timedelta object
from datetime import timedelta

# Create the dictionary entry
default_args = {
  'start_date': datetime(2020, 2, 20),
  'sla': timedelta(minutes = 30)
}

# Add to the DAG. A manually triggered DAG uses the Python value None for
# schedule_interval; the string '@None' is not one of the schedule presets.
test_dag = DAG('test_workflow', default_args=default_args, schedule_interval=None)


# Import the timedelta object
from datetime import timedelta

# Manually triggered DAG: schedule_interval is the Python value None
# (the string '@None' is not one of the schedule presets).
test_dag = DAG('test_workflow', start_date=datetime(2020,2,20), schedule_interval=None)

# Create the task with the SLA
task1 = BashOperator(task_id='first_task',
                     sla=timedelta(hours = 3),
                     bash_command='initialize_data.sh',
                     dag=test_dag)

In [None]:
# E-mail task that sends the monthly report PDF as an attachment.
email_report = EmailOperator(
    dag=report_dag,
    task_id='email_report',
    to='airflow@datacamp.com',
    subject='Airflow Monthly Report',
    html_content="Attached is your monthly workflow report - please refer to it for more detail",
    files=['monthly_report.pdf'],
)

# The report has to be generated before it can be e-mailed.
generate_report >> email_report

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from datetime import datetime

# Alerting defaults for the tasks of this DAG. The keys have to match operator
# argument names ('email', 'email_on_failure') to take effect.
default_args={
    'email': ['airflowalerts@datacamp.com','airflowadmin@datacamp.com'],
    'email_on_failure': True,
    # NOTE(review): the former 'on_success': True entry is not carried over
    # because Airflow operators have no success-e-mail flag; use an
    # `on_success_callback` if success notifications are required.
}
# Runs once a day at midnight (cron "0 0 * * *").
report_dag = DAG(
    dag_id = 'execute_report',
    schedule_interval = "0 0 * * *",
    default_args=default_args
)

# Sensor for the data file, run in 'reschedule' mode.
precheck = FileSensor(
    dag=report_dag,
    task_id='check_for_datafile',
    filepath='salesdata_ready.csv',
    mode='reschedule',
    start_date=datetime(2020,2,20),
)

# Shell task that produces the report.
generate_report_task = BashOperator(
    dag=report_dag,
    task_id='generate_report',
    bash_command='generate_report.sh',
    start_date=datetime(2020,2,20),
)

# The report is generated only after the sensor succeeds.
generate_report_task << precheck

#### Template

+ Allow substituting information during a DAG run
+ Provide added flexibility when defining tasks
+ Are created using the jinja templating language

In [None]:
# Jinja template: {{params.filename}} is filled in from the task's `params`.
templated_command = '\n    echo "Reading {{params.filename}}"\n'

t1 = BashOperator(
    dag=example_dag,
    task_id='template_task',
    bash_command=templated_command,
    params=dict(filename='file1.txt'),
)

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

# The start date is the only default shared by the tasks.
default_args = dict(start_date=datetime(2020, 4, 15))

# Daily DAG.
cleandata_dag = DAG(
    dag_id='cleandata',
    schedule_interval='@daily',
    default_args=default_args,
)

# Command template: 'bash cleandata.sh <datestring>', where the date string
# comes from the ds_nodash template variable.
templated_command = '\n    bash cleandata.sh {{ds_nodash}}\n'

# Bash task that runs the templated command.
clean_task = BashOperator(
    dag=cleandata_dag,
    task_id='cleandata_task',
    bash_command=templated_command,
)

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

# The start date is the only default shared by the tasks.
default_args = dict(start_date=datetime(2020, 4, 15))

# Daily DAG.
cleandata_dag = DAG(
    dag_id='cleandata',
    schedule_interval='@daily',
    default_args=default_args,
)

# Command template taking two arguments: the ds_nodash date string and a
# file name read from the task's params.
templated_command = '\n  bash cleandata.sh {{ ds_nodash }} {{params.filename}}\n'

# First task: the template rendered with the sales data file name.
clean_task = BashOperator(
    dag=cleandata_dag,
    task_id='cleandata_task',
    bash_command=templated_command,
    params=dict(filename='salesdata.txt'),
)

# Second task: the same template rendered with the support data file name.
clean_task2 = BashOperator(
    dag=cleandata_dag,
    task_id='cleandata_task2',
    bash_command=templated_command,
    params=dict(filename='supportdata.txt'),
)

# clean_task2 runs after clean_task.
clean_task2 << clean_task

#### Jinja templates

In [None]:
# Jinja loop: emits one echo line per entry of params.filenames.
templated_command = """
{% for filename in params.filenames %}
    echo "Reading {{ filename }}"
{% endfor %}
"""

t1 = BashOperator(task_id = 'template_task',
                 bash_command = templated_command,
                 params = {'filenames': ['file1.txt','file2.txt']},  # arguments must be comma-separated
                 dag = example_dag)

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

# file0.txt ... file29.txt
filelist = [f'file{x}.txt' for x in range(30)]

default_args = {
  'start_date': datetime(2020, 4, 15),
}

cleandata_dag = DAG('cleandata',
                    default_args=default_args,
                    schedule_interval='@daily')

# Modify the template to handle multiple files in a
# single run. Jinja statements are delimited by {% ... %}; with <% ... %> the
# loop tags would be passed through to bash as literal text.
templated_command = """
  {% for filename in params.filenames %}
  bash cleandata.sh {{ ds_nodash }} {{ filename }};
  {% endfor %}
"""

# Modify clean_task to use the templated command
clean_task = BashOperator(task_id='cleandata_task',
                          bash_command=templated_command,
                          params={'filenames': filelist},
                          dag=cleandata_dag)

In [None]:
from airflow.models import DAG
from airflow.operators.email_operator import EmailOperator
from datetime import datetime

# HTML body template: the ds date string plus a user name taken from params.
html_email_str = '\nDate: {{ ds }}\nUsername: {{ params.username }}\n'

# Weekly DAG.
email_dag = DAG(
    dag_id='template_email_test',
    schedule_interval='@weekly',
    default_args=dict(start_date=datetime(2020, 4, 15)),
)

# The subject is itself a template that calls uuid4() via the macros object.
email_task = EmailOperator(
    dag=email_dag,
    task_id='email_task',
    to='testuser@datacamp.com',
    subject="{{ macros.uuid.uuid4() }}",
    html_content=html_email_str,
    params=dict(username='testemailuser'),
)

#### Branching

In [None]:
def branch_test(**kwargs):
    """Pick the next task id from the parity of the run date.

    Reads ``kwargs['ds_nodash']``, converts it to an integer and returns
    ``'even_day_task'`` when that number is even, ``'odd_day_task'`` otherwise.
    """
    date_number = int(kwargs['ds_nodash'])
    is_even = date_number % 2 == 0
    return 'even_day_task' if is_even else 'odd_day_task'
    
# Branching task: branch_test() returns the id of the task to follow.
branch_task = BranchPythonOperator(
    dag=dag,
    task_id='branch_task',
    python_callable=branch_test,
    provide_context=True,
)

In [None]:
# Create a function to determine if years are different
def year_check(**kwargs):
    """Return the task id to follow depending on whether the year changed.

    Compares the first four characters (as integers) of
    ``kwargs['ds_nodash']`` and ``kwargs['prev_ds_nodash']``. Returns
    ``'current_year_task'`` when they are equal, ``'new_year_task'`` otherwise.
    """
    this_year, last_year = (
        int(kwargs[key][:4]) for key in ('ds_nodash', 'prev_ds_nodash')
    )
    return 'new_year_task' if this_year != last_year else 'current_year_task'

# Define the BranchPythonOperator
branch_task = BranchPythonOperator(task_id='branch_task', dag=branch_dag,
                                   python_callable=year_check, provide_context=True)
# Define the dependencies. Both candidate tasks must be downstream of the
# branching *task*; putting the DAG object on the left of `>>` does not make
# branch_task their upstream.
branch_task >> current_year_task
branch_task >> new_year_task

In [None]:
from airflow.models import DAG
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime

# Daily DAG used by the branching example below.
dag = DAG(
    dag_id='BranchingTest',
    schedule_interval='@daily',
    default_args=dict(start_date=datetime(2020, 4, 15)),
)

def branch_test(**kwargs):
  """Map the integer value of ``kwargs['ds_nodash']`` to a task id.

  An even number selects ``'even_day_task'``; an odd one ``'odd_day_task'``.
  """
  remainder = int(kwargs['ds_nodash']) % 2
  # remainder is 0 for even numbers and 1 for odd ones.
  return ('even_day_task', 'odd_day_task')[remainder]
 
start_task = DummyOperator(task_id='start_task', dag=dag)

# Branching task: branch_test() returns the id of the task to follow.
branch_task = BranchPythonOperator(
       task_id='branch_task',
       provide_context=True,
       python_callable=branch_test,
       dag=dag)

even_day_task = DummyOperator(task_id='even_day_task', dag=dag)
even_day_task2 = DummyOperator(task_id='even_day_task2', dag=dag)

odd_day_task = DummyOperator(task_id='odd_day_task', dag=dag)
odd_day_task2 = DummyOperator(task_id='odd_day_task2', dag=dag)

start_task >> branch_task
# Both candidate paths must be downstream of branch_task; otherwise the ids
# returned by branch_test() do not refer to tasks the branch can select.
branch_task >> even_day_task >> even_day_task2
branch_task >> odd_day_task >> odd_day_task2

In [None]:
# To run a specific task from command-line
airflow run <dag_id> <task_id> <date>

# To run a full DAG:
airflow trigger_dag -e <date> <dag_id>

In [None]:
from airflow.models import DAG
from airflow.contrib.sensors.file_sensor import FileSensor

# Import the needed operators
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import date, datetime

def process_data(**context):
  """Write a one-line marker file containing today's date.

  Overwrites ``/home/repl/workspace/processed_data.tmp`` with the text
  ``Data processed on <date.today()>``. The keyword arguments in
  ``context`` are accepted but not used.
  """
  # `with` closes the file even if the write raises.
  with open('/home/repl/workspace/processed_data.tmp', 'w') as file:
    file.write(f'Data processed on {date.today()}')

    
# DAG whose only default is the start date.
dag = DAG(
    default_args={'start_date': datetime(2020,4,1)},
    dag_id='etl_update',
)

# Sensor for the trigger file: poke interval 5, timeout 15.
sensor = FileSensor(
    dag=dag,
    task_id='sense_file',
    filepath='/home/repl/workspace/startprocess.txt',
    poke_interval=5,
    timeout=15,
)

# Remove leftover temporary files.
bash_task = BashOperator(
    dag=dag,
    task_id='cleanup_tempfiles',
    bash_command='rm -f /home/repl/*.tmp',
)

# Run process_data() as a task.
python_task = PythonOperator(
    dag=dag,
    task_id='run_processing',
    python_callable=process_data,
)

# Order: sensor, then cleanup, then processing.
sensor >> bash_task
bash_task >> python_task


In [None]:
from airflow.models import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from dags.process import process_data
from datetime import timedelta, datetime

# Defaults applied through the DAG: start date and a 90-minute SLA.
default_args = dict(
    start_date=datetime(2019,1,1),
    sla=timedelta(minutes=90),
)

dag = DAG(default_args=default_args, dag_id='etl_update')

# Sensor for the trigger file, with a poke interval of 45.
sensor = FileSensor(
    dag=dag,
    task_id='sense_file',
    filepath='/home/repl/workspace/startprocess.txt',
    poke_interval=45,
)

# Remove leftover temporary files.
bash_task = BashOperator(
    dag=dag,
    task_id='cleanup_tempfiles',
    bash_command='rm -f /home/repl/*.tmp',
)

# Run process_data() with the task context passed in as keyword arguments.
python_task = PythonOperator(
    dag=dag,
    task_id='run_processing',
    python_callable=process_data,
    provide_context=True,
)

# Order: sensor, then cleanup, then processing.
python_task << bash_task << sensor

In [None]:
from airflow.models import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.email_operator import EmailOperator
from dags.process import process_data
from datetime import datetime, timedelta

# Defaults applied through the DAG: start date and a 90-minute SLA.
default_args = dict(start_date=datetime(2019,1,1), sla=timedelta(minutes=90))

dag = DAG(
    dag_id='etl_update',
    default_args=default_args,
)

# Sensor for the trigger file, with a poke interval of 45.
sensor = FileSensor(
    filepath='/home/repl/workspace/startprocess.txt',
    poke_interval=45,
    task_id='sense_file',
    dag=dag,
)

# Remove leftover temporary files.
bash_task = BashOperator(
    bash_command='rm -f /home/repl/*.tmp',
    task_id='cleanup_tempfiles',
    dag=dag,
)

# Run process_data() with the task context passed in as keyword arguments.
python_task = PythonOperator(
    python_callable=process_data,
    provide_context=True,
    task_id='run_processing',
    dag=dag,
)


# Subject template: department name from params plus the ds_nodash date string.
email_subject="""
  Email report for {{ params.department }} on {{ ds_nodash }}
"""


# Report e-mail. html_content receives the template variable itself; the
# quoted literal 'email_subject' would send those thirteen characters as the body.
# NOTE(review): assumes the body is meant to repeat the subject text; confirm.
email_report_task = EmailOperator(task_id='email_report_task',
                                  to='sales@mycompany.com',
                                  subject=email_subject,
                                  html_content=email_subject,
                                  params={'department': 'Data subscription services'},
                                  dag=dag)


# Placeholder task for the branch on which no e-mail is sent.
no_email_task = DummyOperator(task_id='no_email_task', dag=dag)


def check_weekend(**kwargs):
    """Choose the branch to follow from the weekday of the execution date.

    Reads ``kwargs['execution_date']``, which may be a ``datetime`` (or a
    subclass) or a ``'YYYY-MM-DD'`` string.

    Returns:
        ``'email_report_task'`` for Monday-Friday, ``'no_email_task'`` for
        Saturday or Sunday.
    """
    dt = kwargs['execution_date']
    # With provide_context=True Airflow passes execution_date as a datetime
    # object, which strptime() rejects; only parse when a string is given.
    if isinstance(dt, str):
        dt = datetime.strptime(dt, "%Y-%m-%d")
    # If dt.weekday() is 0-4, it's Monday - Friday. If 5 or 6, it's Sat / Sun.
    if dt.weekday() < 5:
        return 'email_report_task'
    return 'no_email_task'
    
    
# Branching task: check_weekend() returns the id of the task to follow.
branch_task = BranchPythonOperator(
    dag=dag,
    task_id='check_if_weekend',
    python_callable=check_weekend,
    provide_context=True,
)

# Linear part of the pipeline, ending at the branch.
sensor >> bash_task >> python_task >> branch_task

# The branch fans out to the e-mail task and the no-e-mail placeholder.
branch_task >> [email_report_task, no_email_task]
