## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [1]:
# write your code from here
import os

import great_expectations as ge
import pandas as pd
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.core.batch import BatchRequest
from great_expectations.core.batch import RuntimeBatchRequest

# ---------------------- Step 0: Create Sample In-Memory Data ----------------------
try:
    # Simulated JSON-style product sales data (for profiling).
    # The third row has no ProductID and no Price, so the not-null
    # expectations defined in Step 3 have something to flag.
    sales_data = pd.DataFrame({
        'ProductID': [101, 102, None, 104],
        'ProductName': ['Pen', 'Notebook', 'Eraser', 'Pencil'],
        'Price': [10.5, 20.0, None, 5.75],
    })

    # Simulated API-like data ingestion (status validation).
    # 'Pending' is outside the allowed set ('Active', 'Inactive') checked in Step 4.
    api_data = pd.DataFrame({
        'UserID': [1, 2, 3, 4],
        'Status': ['Active', 'Inactive', 'Pending', 'Active'],
    })
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f"Error during data creation: {e}") from e

# ---------------------- Step 1: Setup Great Expectations Context ----------------------
# NOTE(review): this cell uses the legacy (pre-1.0) Great Expectations API
# (SimpleCheckpoint, RuntimeBatchRequest, block-config datasources). It is
# expected to need a 0.x release that ships the add_or_update_* context
# methods; on 1.x the SimpleCheckpoint import fails with ImportError.
project_dir = "ge_project"
os.makedirs(project_dir, exist_ok=True)
context = ge.get_context(context_root_dir=project_dir)

# ---------------------- Step 2: Register Datasource ----------------------
# Register through the context API. Writing a raw dict into
# `context.datasources` does not build a Datasource object, and
# `save_datasource` expects a datasource instance rather than a name.
datasource_config = {
    "name": "pandas_datasource",
    "class_name": "Datasource",
    "execution_engine": {"class_name": "PandasExecutionEngine"},
    "data_connectors": {
        "runtime_data": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_id"],
        },
    },
}
# add_or_update keeps the cell re-runnable against an existing project dir.
context.add_or_update_datasource(**datasource_config)

# ---------------------- Step 3: Create Expectation Suite for JSON Profiling ----------------------
suite_name_profile = "product_sales_suite"
# Creates an empty suite, replacing any suite of the same name.
context.add_or_update_expectation_suite(expectation_suite_name=suite_name_profile)

# In-memory DataFrames must be wrapped in a RuntimeBatchRequest; the plain
# BatchRequest has no runtime_parameters / batch_identifiers arguments.
batch_request_profile = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="runtime_data",
    data_asset_name="product_sales_data",
    runtime_parameters={"batch_data": sales_data},
    batch_identifiers={"default_id": "prod_001"},
)

validator_profile = context.get_validator(
    batch_request=batch_request_profile,
    expectation_suite_name=suite_name_profile,
)

# Expect no nulls in 'ProductID' and 'Price'
validator_profile.expect_column_values_to_not_be_null("ProductID")
validator_profile.expect_column_values_to_not_be_null("Price")
# Keep expectations that fail on this sample batch. The default
# (discard_failed_expectations=True) would drop them from the saved suite,
# leaving the checkpoint with nothing to validate.
validator_profile.save_expectation_suite(discard_failed_expectations=False)

# ---------------------- Step 4: Create Expectation Suite for API Status Validation ----------------------
suite_name_api = "status_validation_suite"
context.add_or_update_expectation_suite(expectation_suite_name=suite_name_api)

batch_request_api = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="runtime_data",
    data_asset_name="api_status_data",
    runtime_parameters={"batch_data": api_data},
    batch_identifiers={"default_id": "api_001"},
)

validator_api = context.get_validator(
    batch_request=batch_request_api,
    expectation_suite_name=suite_name_api,
)

# Expect values in 'Status' to be either 'Active' or 'Inactive'
validator_api.expect_column_values_to_be_in_set("Status", ["Active", "Inactive"])
# Same reasoning as above: the sample data violates this rule on purpose of
# the exercise, so the failing expectation must survive the save.
validator_api.save_expectation_suite(discard_failed_expectations=False)

# ---------------------- Step 5: Checkpoints ----------------------
# The persisted checkpoint configs reference only the suite name. A batch
# request carrying an in-memory DataFrame is not something the checkpoint
# store can serialize, so the batch is supplied at run time in Step 6.

# JSON Profiling Checkpoint
checkpoint_profile = SimpleCheckpoint(
    name="product_sales_checkpoint",
    data_context=context,
    expectation_suite_name=suite_name_profile,
)
context.add_or_update_checkpoint(checkpoint=checkpoint_profile)

# API Status Validation Checkpoint
checkpoint_api = SimpleCheckpoint(
    name="status_validation_checkpoint",
    data_context=context,
    expectation_suite_name=suite_name_api,
)
context.add_or_update_checkpoint(checkpoint=checkpoint_api)

# ---------------------- Step 6: Run and Evaluate Results ----------------------

# Each run validates its runtime batch against the suite named on the checkpoint.
result_profile = checkpoint_profile.run(batch_request=batch_request_profile)
result_api = checkpoint_api.run(batch_request=batch_request_api)

def print_result(label, result):
    """Print an overall pass/fail banner and one line per evaluated expectation.

    Args:
        label: Heading text shown between the ``---`` markers.
        result: Subscriptable checkpoint result providing ``"success"`` and
            ``"run_results"``; every run result must provide
            ``["validation_result"]["results"]``, whose items carry
            ``"expectation_config"`` and ``"success"``.
    """
    print(f"\n--- {label} ---")
    banner = "✅ All checks passed." if result["success"] else "❌ Some checks failed."
    print(banner)
    print("Validation Results Summary:")
    for run in result["run_results"].values():
        outcomes = run["validation_result"]["results"]
        for outcome in outcomes:
            expectation_type = outcome["expectation_config"]["expectation_type"]
            print(f"{expectation_type}: Success = {outcome['success']}")

# Print the pass/fail banner and per-expectation outcomes for each checkpoint run.
print_result("JSON Product Sales Data Profiling", result_profile)
print_result("API Status Validation", result_api)

# ---------------------- Step 7: Unit Test-like Validation ----------------------

def test_output_structure(results):
    """Check that a checkpoint result exposes the fields the summary relies on.

    Args:
        results: Subscriptable result object supporting ``results["success"]``
            and the ``in`` operator.

    Raises:
        AssertionError: If ``results["success"]`` is not a ``bool`` or
            ``"run_results"`` is not found in ``results``.
    """
    success_flag = results["success"]
    assert isinstance(success_flag, bool), "Success must be a boolean."
    assert "run_results" in results, "Missing run_results key."


# Apply the structural check to both checkpoint results.
test_output_structure(result_profile)
test_output_structure(result_api)


ImportError: cannot import name 'SimpleCheckpoint' from 'great_expectations.checkpoint' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/checkpoint/__init__.py)

2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that the 'Status' field contains only the predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [None]:
# write your code from here