In [1]:
# 1. Installation
# Run this in your shell/terminal once (not inside the script):
# !pip install great_expectations

# ---------------------------------------------------
# 2-6: Implementing Great Expectations Completeness Validation
# ---------------------------------------------------

import great_expectations as ge
from great_expectations.core.batch import BatchRequest
import os

def setup_great_expectations():
    try:
        # 2. Initialize Great Expectations in current working directory (if not done yet)
        if not os.path.exists("great_expectations"):
            from great_expectations.data_context import DataContext
            DataContext.create()

        # 3. Load Data Context
        context = ge.data_context.DataContext()

        # 4. Create or get an Expectation Suite for completeness checks
        suite_name = "completeness_suite"
        if suite_name not in context.list_expectation_suites():
            suite = context.create_expectation_suite(suite_name=suite_name, overwrite_existing=True)
        else:
            suite = context.get_expectation_suite(suite_name)

        return context, suite_name

    except Exception as e:
        print(f"Error setting up Great Expectations: {e}")
        return None, None

def validate_data_completeness(df, context, suite_name):
    try:
        # 5. Load data as a GE dataset (PandasDataset)
        ge_df = ge.from_pandas(df)

        # Add expectation: at least 95% completeness for every column
        for col in df.columns:
            ge_df.expect_column_values_to_not_be_null(col, mostly=0.95)

        # Save suite expectations to context
        context.save_expectation_suite(ge_df.get_expectation_suite(), suite_name)

        # 6. Run validation
        batch_request = {
            "datasource_name": "default_pandas_datasource",
            "data_connector_name": "default_runtime_data_connector",
            "data_asset_name": "runtime_data_asset",
            "runtime_parameters": {"batch_data": df},
            "batch_identifiers": {"default_identifier_name": "default_identifier"},
        }

        results = context.run_validation_operator(
            "action_list_operator",
            assets_to_validate=[batch_request],
            run_name="completeness_validation_run",
            expectation_suite_name=suite_name,
        )

        print("Validation Results Summary:")
        print(results['success'])
        return results['success']

    except Exception as e:
        print(f"Error during validation: {e}")
        return False

# ---------------- Main Execution ----------------

if __name__ == "__main__":
    import pandas as pd
    import numpy as np

    # Sample data with some missing values
    df = pd.DataFrame({
        "name": ["Alice", "Bob", None, "David", "Eve"],
        "age": [25, None, 35, 40, 22],
        "email": ["a@example.com", "b@example.com", None, "d@example.com", "e@example.com"]
    })

    print("Setting up Great Expectations context and suite...")
    context, suite_name = setup_great_expectations()

    if context and suite_name:
        print("Running data completeness validation...")
        passed = validate_data_completeness(df, context, suite_name)

        if passed:
            print("✅ Data completeness SLA passed!")
        else:
            print("❌ Data completeness SLA failed!")
    else:
        print("Failed to initialize Great Expectations.")


Setting up Great Expectations context and suite...
Error setting up Great Expectations: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)
Failed to initialize Great Expectations.
