In [2]:
import great_expectations as ge
import os
import pandas as pd

def initialize_ge_context():
    """Initialize or load Great Expectations Data Context and create/get an expectation suite."""
    try:
        if not os.path.exists("great_expectations"):
            from great_expectations.data_context import DataContext
            DataContext.create()
        context = ge.data_context.DataContext()

        suite_name = "completeness_suite"
        if suite_name not in context.list_expectation_suites():
            context.create_expectation_suite(suite_name=suite_name, overwrite_existing=True)
        return context, suite_name
    except Exception as err:
        print(f"Error initializing GE context: {err}")
        return None, None

def is_valid_dataframe(df):
    """Check if the DataFrame is non-empty and contains columns."""
    if not isinstance(df, pd.DataFrame):
        print("Input is not a DataFrame.")
        return False
    if df.empty:
        print("Input DataFrame is empty.")
        return False
    if df.columns.size == 0:
        print("Input DataFrame has no columns.")
        return False
    return True

def validate_completeness(df, context, suite_name, threshold=0.95):
    """
    Run completeness validation on each column in df with the specified threshold.
    Returns True if validation passes, else False.
    """
    if not is_valid_dataframe(df):
        print("DataFrame validation failed before running Great Expectations.")
        return False

    try:
        ge_df = ge.from_pandas(df)

        for column in df.columns:
            ge_df.expect_column_values_to_not_be_null(column, mostly=threshold)

        context.save_expectation_suite(ge_df.get_expectation_suite(), suite_name)

        batch_request = {
            "datasource_name": "default_pandas_datasource",
            "data_connector_name": "default_runtime_data_connector",
            "data_asset_name": "runtime_data_asset",
            "runtime_parameters": {"batch_data": df},
            "batch_identifiers": {"default_identifier_name": "default_identifier"},
        }

        results = context.run_validation_operator(
            "action_list_operator",
            assets_to_validate=[batch_request],
            run_name="completeness_validation_run",
            expectation_suite_name=suite_name,
        )
        return results["success"]
    except Exception as err:
        print(f"Validation error: {err}")
        return False

# --- Unit tests ---

import unittest

class TestGreatExpectationsCompleteness(unittest.TestCase):

    def setUp(self):
        self.context, self.suite_name = initialize_ge_context()
        self.valid_df = pd.DataFrame({
            "name": ["Alice", "Bob", "Charlie", "David"],
            "age": [25, 30, 35, 40],
            "email": ["a@example.com", "b@example.com", "c@example.com", "d@example.com"]
        })
        self.empty_df = pd.DataFrame()
        self.no_columns_df = pd.DataFrame([])

    def test_is_valid_dataframe(self):
        self.assertTrue(is_valid_dataframe(self.valid_df))
        self.assertFalse(is_valid_dataframe(self.empty_df))
        self.assertFalse(is_valid_dataframe(self.no_columns_df))
        self.assertFalse(is_valid_dataframe(None))
        self.assertFalse(is_valid_dataframe("not a df"))

    def test_validate_completeness_pass(self):
        if self.context and self.suite_name:
            result = validate_completeness(self.valid_df, self.context, self.suite_name)
            self.assertIsInstance(result, bool)
            self.assertTrue(result)

    def test_validate_completeness_fail_empty_df(self):
        if self.context and self.suite_name:
            result = validate_completeness(self.empty_df, self.context, self.suite_name)
            self.assertFalse(result)

    def test_validate_completeness_fail_no_context(self):
        result = validate_completeness(self.valid_df, None, None)
        self.assertFalse(result)

if __name__ == "__main__":
    # Example execution with sample data
    sample_df = pd.DataFrame({
        "name": ["Alice", "Bob", None, "David", "Eve"],
        "age": [25, None, 35, 40, 22],
        "email": ["a@example.com", "b@example.com", None, "d@example.com", "e@example.com"]
    })

    context, suite = initialize_ge_context()
    if context and suite:
        passed = validate_completeness(sample_df, context, suite)
        print("Data Completeness Validation Passed" if passed else "Validation Failed")
    else:
        print("Failed to initialize Great Expectations.")

    # Run unit tests
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


....
----------------------------------------------------------------------
Ran 4 tests in 0.007s

OK


Error initializing GE context: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)
Failed to initialize Great Expectations.
Error initializing GE context: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)
Input DataFrame is empty.
Input DataFrame is empty.
Input is not a DataFrame.
Input is not a DataFrame.
Error initializing GE context: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)
Error initializing GE context: cannot import name 'DataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)
Validation error: module 'great_expectations' has no attribute 