In [1]:
import pandas as pd
import great_expectations as ge
from great_expectations.core.batch import BatchRequest
from great_expectations.data_context import FileDataContext
import os

# Sample dataset
csv_path = "sample.csv"
df = pd.DataFrame({
    "Name": ["Alice", "Bob", None, "David"],
    "Email": ["alice@example.com", "bob[at]mail.com", "charlie@example.com", None],
    "Age": [25, 30, None, 40]
})
df.to_csv(csv_path, index=False)

# Task 1: Metric Calculations
def calculate_metrics(df):
    completeness = df.notnull().mean().mean() * 100
    validity = df['Email'].dropna().str.contains("@").mean() * 100
    uniqueness = df['Email'].nunique() / len(df) * 100
    return completeness, validity, uniqueness

# Task 2: Data Quality Score
def calculate_quality_score(completeness, validity, uniqueness):
    return (completeness + validity + uniqueness) / 3

# Task 3: Create Expectations
context = ge.get_context()
expectation_suite_name = "dq_suite"

if not context.suites.get_expectation_suite(expectation_suite_name):
    context.add_expectation_suite(expectation_suite_name)

datasource_name = "dq_csv"
if datasource_name not in context.list_datasources():
    context.sources.add_pandas(datasource_name=datasource_name)

validator = context.sources.pandas_default.read_csv(csv_path)
validator.expect_column_values_to_not_be_null("Name")
validator.expect_column_values_to_not_be_null("Email")
validator.expect_column_values_to_match_regex("Email", r".+@.+\\..+")
context.save_expectation_suite(validator.get_expectation_suite(), expectation_suite_name)

# Task 4: Validation and Report
docs_path = os.path.join(context.root_directory, "uncommitted", "data_docs")
checkpoint = context.add_or_update_checkpoint(
    name="dq_checkpoint",
    validations=[
        {
            "batch_request": {
                "runtime_parameters": {"path": csv_path},
                "datasource_name": "dq_csv",
                "data_connector_name": "default_inferred_data_connector_name",
                "data_asset_name": "sample",
                "runtime_parameters": {"batch_data": df},
                "batch_identifiers": {"default_identifier_name": "default_id"},
            },
            "expectation_suite_name": expectation_suite_name,
        }
    ]
)
checkpoint_result = checkpoint.run()
context.build_data_docs()
print(f"Report generated at: {docs_path}")

# Task 5: Automate Metric + Quality Score
completeness, validity, uniqueness = calculate_metrics(df)
score = calculate_quality_score(completeness, validity, uniqueness)
print(f"Data Quality Score: {score:.2f}%")

# Task 6: Automated Cleaning
THRESHOLD = 80
if score < THRESHOLD:
    print("Score below threshold. Triggering cleaning script...")
    df['Name'].fillna("Unknown", inplace=True)
    df['Email'] = df['Email'].apply(lambda x: x if pd.notnull(x) and '@' in x else 'unknown@example.com')
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df.to_csv("cleaned_sample.csv", index=False)
    print("Cleaned data saved to 'cleaned_sample.csv'")
else:
    print("Data quality is acceptable. No cleaning necessary.")


AttributeError: 'SuiteFactory' object has no attribute 'get_expectation_suite'