In [1]:
# Question: Data Quality Automation Tools - Introduction to Great Expectations
# Description: Set up a simple Great Expectations check for missing values in a numeric column.
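# First install Great Expectations if you haven't. Run the line below in its own
# notebook cell (use %pip so the package lands in this kernel's environment).
# This notebook uses the pre-1.0 fluent API (`context.sources`,
# `context.get_validator`, `context.add_or_update_checkpoint`), so pin a 0.x release:
# %pip install "great_expectations>=0.17,<1.0"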

import great_expectations as gx
import pandas as pd

# 1. Build the sample data: one identifier column plus two numeric columns.
#    "age" deliberately carries two missing entries so the null check below
#    has something to report.
data = dict(
    id=[1, 2, 3, 4, 5],
    age=[25, 30, None, 42, None],
    income=[50000, 60000, 75000, 48000, 90000],
)
df = pd.DataFrame(data=data)

# 2. Initialize Great Expectations
# The Data Context is the entry point every later step hangs off.
context = gx.get_context()

# 3. Create a Data Asset (Pandas DataFrame)
# Only the asset is registered here. Passing `dataframe=` to
# add_dataframe_asset() is deprecated (GX docs, since 0.16.15); the frame is
# supplied when the batch request is built instead.
data_asset = context.sources.pandas_default.add_dataframe_asset(
    name="my_dataframe_asset",
)

# 4. Build a Batch Request
# The in-memory DataFrame is attached here, tying the request to `df`.
batch_request = data_asset.build_batch_request(dataframe=df)

# 5. Create an Expectation Suite
# add_or_update_* is used so the suite name can be reused without a
# "does it already exist?" check.
expectation_suite_name = "missing_values_suite"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# 6. Create a Validator
# The validator pairs the batch (the data) with the suite (the rules).
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)

# 7. Add Expectations for Missing Values
# "age" contains nulls in the sample data, so this expectation is expected
# to fail; that failure is what the rest of the notebook reports on.
validator.expect_column_values_to_not_be_null(
    column="age",
    meta={
        "notes": {
            "format": "markdown",
            "content": "This column should not contain null values as age is a required field."
        }
    }
)

# 8. Save the Expectation Suite
# discard_failed_expectations=False keeps the expectation in the saved suite
# even though it did not pass against this batch.
validator.save_expectation_suite(discard_failed_expectations=False)

# 9. Create a Checkpoint
# The checkpoint is built directly from the validator, so it validates the
# same batch against the same suite.
checkpoint = context.add_or_update_checkpoint(
    name="missing_values_checkpoint",
    validator=validator,
)

# 10. Run the Validation
checkpoint_result = checkpoint.run()

# 11. View Results
# Fetch the list once and reuse it for the summary further down.
validation_results = checkpoint_result.list_validation_results()
print("\nValidation Results:")
print(validation_results)

# 12. Generate Data Docs (HTML report)
# build_data_docs() returns a mapping of site name -> index page location.
# Report that instead of a hard-coded project path: the context returned by
# gx.get_context() is not necessarily backed by a local great_expectations/
# folder, in which case the hard-coded path would point nowhere.
docs_sites = context.build_data_docs()
print("\nData Docs built. Open the index page in your browser:")
for site_name, index_location in docs_sites.items():
    print(f"  {site_name}: {index_location}")

# 13. You can also get a summary of the validation
# Guard the two [0] lookups so an empty run produces a clear message rather
# than an opaque IndexError.
if not validation_results:
    raise RuntimeError("Checkpoint run returned no validation results.")
validation_result = validation_results[0]

expectation_results = validation_result['results']
if not expectation_results:
    raise RuntimeError("Validation result contains no expectation results.")
null_check = expectation_results[0]   # the single "age not null" expectation
null_check_stats = null_check['result']

print("\nMissing Value Check Summary:")
print("Column: age")
print(f"Success: {null_check['success']}")
print(f"Missing Count: {null_check_stats.get('unexpected_count')}")

# unexpected_percent can be absent/None (e.g. nothing was evaluated); the
# ':.2f' format would raise TypeError on None, so handle that case explicitly.
missing_percent = null_check_stats.get('unexpected_percent')
if missing_percent is None:
    print("Missing Percentage: n/a")
else:
    print(f"Missing Percentage: {missing_percent:.2f}%")



ModuleNotFoundError: No module named 'great_expectations'