In [None]:
# Minimal DAG: only an id and a default start date are supplied.
etl_dag = DAG(
    dag_id='etl_pipeline',
    default_args={'start_date': '2020-01-08'},
)

In [None]:
# airflow run command (shell)
airflow run <dag_id> <task_id> <start_date> 

In [None]:
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

# Arguments handed to the DAG as its default_args.
default_arguments = dict(
    owner='Thulasiram',
    email='tulasiram.gunipati@gmail.com',
    start_date=datetime(2020, 1, 20),
    retries=2,
)

etl_dag = DAG(dag_id='etl_workflow', default_args=default_arguments)

# airflow -h for descriptions
# airflow list_dags to show all recognized DAGs

# Task that echoes a shell-generated random number.
# It is attached to etl_dag, the DAG object created in this cell
# (the name `dag` is not defined here).
part1 = BashOperator(
    task_id='generate_random_number',
    bash_command='echo $RANDOM',
    dag=etl_dag,
)

In [None]:
# Airflow command to start the server
airflow webserver -p port_number

In [None]:
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.http_operator import SimpleHttpOperator

# DAG that the tasks defined further down in this cell are attached to.
dag = DAG(
    dag_id='update_start',
    default_args={"start_date": "2019-10-01"},
)

# Echo a random number produced by the shell.
part1 = BashOperator(
    dag=dag,
    task_id='generate_random_number',
    bash_command='echo $RANDOM',
)

import sys


def python_version():
    """Return the running interpreter's version string (``sys.version``)."""
    interpreter_version = sys.version
    return interpreter_version

# Task whose callable is python_version().
part2 = PythonOperator(
    dag=dag,
    task_id='get_python_version',
    python_callable=python_version,
)

# HTTP GET task.
# NOTE(review): SimpleHttpOperator normally combines `endpoint` with the host of
# its HTTP connection — confirm that passing a full URL here is intended.
part3 = SimpleHttpOperator(
    task_id='query_server_for_external_ip',
    endpoint='https://api.ipify.org',
    method='GET',
    dag=dag,
)

# part3 is upstream of part2, i.e. part3 runs first.
part3 >> part2

#### Operators

In [None]:
+ Operators are available in the airflow.operators / airflow.contrib.operators libraries
+ Represents a single task in a workflow
+ Run independently (usually)
+ Generally do not share information
+ Various operators to perform different tasks
+ Using BashOperator, we can move from running individual bash scripts to running them as Airflow tasks

#### Tasks

+ Instances of operators
+ Task Dependencies
+ Referred to as upstream or downstream
+ Defined using bitshift operators
+ >> upstream operator
+ << downstream operator

In [None]:
# pull_sales: BashOperator task that runs wget against the sales tracking URL
pull_sales = BashOperator(
    dag=analytics_dag,
    task_id='pullsales_task',
    bash_command='wget https://salestracking/latestinfo?json',
)

# Execution order: pull_sales first, then cleanup, then consolidate, and push_data last.
pull_sales >> cleanup >> consolidate >> push_data

In [None]:
from airflow.operators.python_operator import PythonOperator
def sleep(length_of_time):
    """Pause for ``length_of_time`` seconds.

    Args:
        length_of_time: Number of seconds passed on to ``time.sleep``.
    """
    # Imported inside the callable because this cell does not import `time`;
    # without it the call below raises NameError.
    import time

    time.sleep(length_of_time)
    
# Task that calls sleep(length_of_time=5) as part of example_dag
sleep_task = PythonOperator(
    dag=example_dag,
    python_callable=sleep,
    op_kwargs={'length_of_time': 5},
    task_id='sleep',
)


In [None]:
# Airflow provides an EmailOperator as well
from airflow.operators.email_operator import EmailOperator

# Task that e-mails the sales report.
email_task = EmailOperator(
    task_id = 'email_sales_report',
    to = 'sales_manager@example.com',
    subject = "Automated report",
    html_content = 'Attached is the latest report',
    # EmailOperator documents `files` as a list of file names, so the single
    # attachment is wrapped in a list rather than passed as a bare string.
    files = ['latest_sales.xlsx'],
    dag = example_dag
)

In [None]:
# Code to download data from url

def pull_file(URL, savepath):
    """Download ``URL`` and write the response body to ``savepath``.

    Args:
        URL: Address fetched with ``requests.get``.
        savepath: Path of the file the response bytes are written to. It is
            opened in binary write mode, so an existing file is replaced.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            file is not opened in that case.
    """
    # Imported inside the callable so it does not depend on an import made
    # in some other cell.
    import requests

    response = requests.get(URL)
    # Fail on 4xx/5xx instead of saving an error page as if it were data.
    response.raise_for_status()
    with open(savepath, 'wb') as f:
        f.write(response.content)
    # Use the print method for logging
    print(f"File pulled from {URL} and saved to {savepath}")

from airflow.operators.python_operator import PythonOperator

# Task that runs pull_file as part of process_sales_dag
pull_file_task = PythonOperator(
    dag=process_sales_dag,
    task_id='pull_file',
    # Function the task calls
    python_callable=pull_file,
    # Keyword arguments handed to that function
    op_kwargs={
        'URL': 'http://dataserver/sales.json',
        'savepath': 'latestsales.json',
    },
)

#### Scheduling 

In [None]:
start_date - Date / time to initially schedule the DAG run
end_date - Optional attribute for when to stop running new DAG instances
max_tries - How many attempts to make
schedule_interval - How often to run - cron syntax or built in presets

#### cron syntax
minute - (0 - 59)
hour - (0 - 23)
day of the month - (1 - 31)
month - (1 - 12)
day of the week - (0 - 6)(sunday to saturday)

# preset    cron equivalent
@hourly     0 * * * *
@daily      0 0 * * *
@weekly     0 0 * * 0
None - used for manual triggered DAGs
@once - schedule only once

In [None]:
# `timedelta` is needed for retry_delay below and was not imported.
from datetime import datetime, timedelta

# Update the scheduling arguments as defined
default_args = {
    'owner': 'Engineering',
    'start_date': datetime(2019, 11, 1),
    'email': ['airflowresults@datacamp.com'],
    # No e-mail is sent on failure or on retry.
    'email_on_failure': False,
    'email_on_retry': False,
    # Up to 3 retries, 20 minutes apart.
    'retries': 3,
    'retry_delay': timedelta(minutes=20),
}

# Cron '30 12 * * 3': minute 30, hour 12, weekday 3 -> every Wednesday at 12:30
dag = DAG(
    dag_id='update_dataflows',
    default_args=default_args,
    schedule_interval='30 12 * * 3',
)

#### sensor

+ An operator that waits for a certain condition to be true
+ Can be defined how often to check for the condition to be true
+ They are a type of operators and can be assigned to tasks

In [None]:
Sensors are derived from airflow.sensors.base_sensor_operator
mode = 'poke' - Run repeatedly
mode = 'reschedule' - Give up the task slot and try again later
timeout - How long to wait, in seconds, before the sensor fails
Sensors also accept the usual operator arguments

In [None]:
# File sensor
from airflow.contrib.sensors.file_sensor import FileSensor
# Sensor task watching for salesdata.csv, with a poke interval of 300
file_sensor_task = FileSensor(
    dag=sales_report_dag,
    task_id='file_sense',
    filepath='salesdata.csv',
    poke_interval=300,
)

# The sensor sits between the cleanup task and the report task
init_sales_cleanup >> file_sensor_task >> generate_report

# ExternalTaskSensor - wait for a task in another DAG to complete
# HttpSensor - Request a web URL and check for content
# SqlSensor - Runs a SQL query to check for content

##### When to use a Sensor
+ Uncertain when it will be true
+ If failure not immediately desired
+ To add task repetition without loops

#### Executors

In [None]:
+ Executors run tasks
+ Example executors - SequentialExecutor, LocalExecutor, CeleryExecutor
+ We can know the type of executor by looking at the airflow.cfg file
+ cat airflow/airflow.cfg | grep "executor ="
+ we can also know this from airflow list_dags