## Plugin Class creation basics

1. First we will see the code without Plugin
2. Then we will create a custom Plugin

#### Code Without plugin.

In [None]:
import datetime
import logging

from airflow import DAG
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.python_operator import PythonOperator

import sql_statements

## function definition for the task for loading_trip_data_to_redshift
def load_trip_data_to_redshift(*args, **kwargs):
    """Run the monthly trips COPY statement against the "redshift" connection.

    Fetches credentials through the "aws_credentials" AWS hook, fills the
    ``COPY_MONTHLY_TRIPS_SQL`` template with them and with the year and month
    of the run's ``execution_date``, then executes the resulting statement.

    Raises:
        KeyError: if ``execution_date`` is not present in ``kwargs``.
    """
    aws_creds = AwsHook("aws_credentials").get_credentials()
    redshift = PostgresHook("redshift")
    run_date = kwargs["execution_date"]

    # The template takes the AWS key pair positionally and the period to
    # load through its named ``year`` / ``month`` placeholders.
    copy_sql = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
        aws_creds.access_key,
        aws_creds.secret_key,
        year=run_date.year,
        month=run_date.month,
    )
    redshift.run(copy_sql)

## function definition for the task for loading_station_data_to_redshift
def load_station_data_to_redshift(*args, **kwargs):
    """Run the stations COPY statement against the "redshift" connection.

    Fetches credentials through the "aws_credentials" AWS hook, fills the two
    positional placeholders of ``COPY_STATIONS_SQL`` with the access key and
    secret key, then executes the resulting statement. The keyword arguments
    are accepted but not used.
    """
    aws_creds = AwsHook("aws_credentials").get_credentials()
    redshift = PostgresHook("redshift")
    key_pair = (aws_creds.access_key, aws_creds.secret_key)
    copy_sql = sql_statements.COPY_STATIONS_SQL.format(*key_pair)
    redshift.run(copy_sql)


## function definition for the code quality task for checking_greater_than_zero  
def check_greater_than_zero(*args, **kwargs):
    """Data quality check: fail unless the given table has at least one row.

    The table name is read from ``kwargs["params"]["table"]`` and counted
    through the "redshift" connection.

    Raises:
        KeyError: if ``params`` or its ``table`` entry is missing.
        ValueError: if the count query yields no result, or the count is
            lower than one.
    """
    table = kwargs["params"]["table"]
    count_query = f"SELECT COUNT(*) FROM {table}"
    result_rows = PostgresHook("redshift").get_records(count_query)

    # A usable answer needs at least one row with at least one column.
    if not (len(result_rows) >= 1 and len(result_rows[0]) >= 1):
        raise ValueError(f"Data quality check failed. {table} returned no results")

    row_count = result_rows[0][0]
    if row_count < 1:
        raise ValueError(f"Data quality check failed. {table} contained 0 rows")

    logging.info(f"Data quality on table {table} check passed with {row_count} records")


## Dag definition
# Monthly DAG limited to the 2018 date window, with at most one active run
# at any time.
dag = DAG(
    dag_id='lesson2.exercise4',
    schedule_interval='@monthly',
    start_date=datetime.datetime(2018, 1, 1),
    end_date=datetime.datetime(2018, 12, 1),
    max_active_runs=1,
)


## Task to create the trips table in redshift
create_trips_table = PostgresOperator(
    dag=dag,
    task_id="create_trips_table",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
    postgres_conn_id="redshift",
)


## Task to load_trips_data_from_s3_to_redshift
## (the callable reads the execution date, so the context is passed in)
copy_trips_task = PythonOperator(
    dag=dag,
    task_id="load_trips_from_s3_to_redshift",
    provide_context=True,
    python_callable=load_trip_data_to_redshift,
)

## Task to check data quality of the trips table
check_trips = PythonOperator(
    dag=dag,
    task_id="check_trips_data",
    provide_context=True,
    python_callable=check_greater_than_zero,
    params={"table": "trips"},
)

## Task to create_stations_table
create_stations_table = PostgresOperator(
    dag=dag,
    task_id="create_stations_table",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
    postgres_conn_id="redshift",
)


## Task to load_stations_data_from_s3_to_redshift
## (the callable does not read the context, so none is requested)
copy_stations_task = PythonOperator(
    dag=dag,
    task_id="load_stations_from_s3_to_redshift",
    python_callable=load_station_data_to_redshift,
)


## Task to check_stations_data
check_stations = PythonOperator(
    dag=dag,
    task_id="check_stations_data",
    provide_context=True,
    python_callable=check_greater_than_zero,
    params={"table": "stations"},
)


## dependencies
# Each table follows the same chain: create the table, load it, then run
# the row-count check on it.
create_trips_table >> copy_trips_task >> check_trips
create_stations_table >> copy_stations_task >> check_stations

NOTE: The function `check_greater_than_zero` defined above is used more than once (for both the trips and the stations tables). This is an opportunity to create an operator: instead of rewriting the quality-check function in every pipeline, we write the operator once, and afterwards we only have to pass parameters to the operator class to create a task.

#### -----------------------------------------------------------------------------

## Create Plugin: 

##### 1. has_rows plugin

###### -----------------------------------------------------------------
###### Before creating the plugin we should review inheritance in Python.

In [4]:
# Example Python code to demonstrate how parent constructors 
# are called. 
  
# parent class 
class Person(object):
    """Parent class that stores a name and an id number."""

    # __init__ is known as the constructor
    def __init__(self, name, idnumber):
        """Keep ``name`` and ``idnumber`` on the instance."""
        self.name, self.idnumber = name, idnumber

    def display(self):
        """Print the name and then the id number, each on its own line."""
        for value in (self.name, self.idnumber):
            print(value)
  
# child class  
class Employee(Person):
    """Child class of ``Person`` that additionally stores a salary and a post."""

    def __init__(self, name, idnumber, salary, post):
        """Store the employee fields, then let ``Person`` store the rest."""
        self.salary = salary
        self.post = post

        # invoking the __init__ of the parent class
        Person.__init__(self, name, idnumber)

    def show(self):
        """Print name, id, salary and post on a single line."""
        summary = (
            f"name is {self.name}; id is {self.idnumber}; "
            f"salary is {self.salary}; post is {self.post}"
        )
        print(summary)
  
                  
# Create an Employee instance; its constructor also runs Person.__init__,
# so the object carries the attributes of both classes.
a = Employee(name='Rahul', idnumber=886012, salary=1000000, post="BD")

# Call show(), which is defined on Employee and prints the attributes set
# by both the Employee and the Person constructors.
a.show()

name is Rahul; id is 886012; salary is 1000000; post is BD


### --------------------------------------------------------

##### Plugin contd....

##### Core idea behind creating a plugin

1. See the objects required for creating a class from the function definition.
 In our case we require: table name, redshift connection, and context variables args , kwargs
 
2. Since we now know the parameters we need, we can create a class that takes the above objects as parameters.

In [None]:
## function definition for the code quality task for checking_greater_than_zero  
def check_greater_than_zero(*args, **kwargs):
    """Raise unless the table given in ``params["table"]`` holds some rows.

    Counts the rows of the table through the "redshift" connection and logs
    the count when the check passes.

    Raises:
        KeyError: if ``params`` or its ``table`` entry is missing.
        ValueError: if the count query yields no result, or the count is
            lower than one.
    """
    table = kwargs["params"]["table"]
    hook = PostgresHook("redshift")
    rows = hook.get_records(f"SELECT COUNT(*) FROM {table}")

    # No row at all, or a first row without columns: nothing to inspect.
    if len(rows) == 0 or len(rows[0]) == 0:
        raise ValueError(f"Data quality check failed. {table} returned no results")

    total = rows[0][0]
    if total < 1:
        raise ValueError(f"Data quality check failed. {table} contained 0 rows")

    logging.info(f"Data quality on table {table} check passed with {total} records")
    
    
## From the function above, we require: table name, redshift connection, and context variables args, kwargs

In [None]:
## Plugin Definition

import logging # It is used to include information (generated by user) into the airflow logs. 
# Helpful in debugging the code

from airflow.hooks.postgres_hook import PostgresHook  

from airflow.models import BaseOperator # Base Operator : is a parent class inherited by all the plugins

from airflow.utils.decorators import apply_defaults # It is used as a decorator function around the class constructor  


class HasRowsOperator(BaseOperator):
    """Operator whose ``execute`` fails when the configured table has no rows.

    Args:
        redshift_conn_id: connection id handed to ``PostgresHook``.
        table: name of the table whose rows are counted.
        *args, **kwargs: forwarded unchanged to ``BaseOperator``.
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        self.table = table
        self.redshift_conn_id = redshift_conn_id

        # invoking constructor of parent class
        super(HasRowsOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        """Count the rows of ``self.table`` and raise if there are none.

        Raises:
            ValueError: if the count query yields no result, or the count is
                lower than one.
        """
        count_query = f"SELECT COUNT(*) FROM {self.table}"
        result_rows = PostgresHook(self.redshift_conn_id).get_records(count_query)

        # A usable answer needs at least one row with at least one column.
        if not (len(result_rows) >= 1 and len(result_rows[0]) >= 1):
            raise ValueError(f"Data quality check failed. {self.table} returned no results")

        row_count = result_rows[0][0]
        if row_count < 1:
            raise ValueError(f"Data quality check failed. {self.table} contained 0 rows")

        logging.info(f"Data quality on table {self.table} check passed with {row_count} records")
        
# BaseOperator is a parent class and is a must for all the plugin creation as per airflow documentation.

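# @apply_defaults: is a decorator function. Per the Airflow documentation, it fills in constructor arguments that were not passed explicitly using the values from the DAG's `default_args`.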

# def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
#  pass the table name, redshift connection, and context variables to the 
# - constructor function of the HasRowsOperator() class.

# super(HasRowsOperator, self).__init__(*args, **kwargs) : is meant to invoke the constructor function of the parent class

# def execute(self, context): This function contains all the operations which were there 
# - in the function definition of check_greater_than_zero(*args, **kwargs).
# - This function is a must for the operator creation. At the time of the run of this operator in a dag, the execute function is executed by the  airflow.

## This basic template is used to create any custom plugin.