### Integrating with Apache Airflow
**Description**: Integrate Great Expectations with Apache Airflow to run data quality checks automatically in your DAG.

**Steps**:
1. Install Airflow (if you haven't already):
2. Airflow DAG Integration:
    - Create a DAG file:
3. Deploy and Test:
    - Place this file in your Airflow DAGs directory and start your Airflow scheduler.
    - Open the Airflow UI and trigger the DAG to see it run your expectations.

In [8]:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
import great_expectations as ge
from great_expectations.checkpoint import Checkpoint
import os

# === DAG Configuration ===

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2024, 1, 1),
    'email_on_failure': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(
    'ge_data_quality_check',
    default_args=default_args,
    description='Run Great Expectations data validation in Airflow',
    schedule_interval='@daily',
    catchup=False
)

# === Task Function ===

def run_data_validation():
    try:
        # Step 1: Load GE context
        context = ge.get_context()

        # Optional: Print all checkpoints
        print("Available Checkpoints:", context.list_checkpoints())

        # Step 2: Run a predefined checkpoint
        results = context.run_checkpoint(checkpoint_name="my_checkpoint")

        # Step 3: Evaluate result
        if not results["success"]:
            raise ValueError("❌ Data quality checks failed!")
        print("✅ All expectations passed.")

    except Exception as e:
        print(f"❌ Error during data validation: {str(e)}")
        raise

# === Airflow Operator ===

validate_task = PythonOperator(
    task_id='run_great_expectations',
    python_callable=run_data_validation,
    dag=dag
)

validate_task


ModuleNotFoundError: No module named 'airflow'

In [9]:
# Using inline expectations for test-like use
df = ge.from_pandas(pd.DataFrame({"age": [21, 22, 24]}))
result = df.expect_column_values_to_be_between("age", min_value=20, max_value=30)
assert result.success, "Validation failed!"


NameError: name 'ge' is not defined