### Task 1: Understanding and Defining Data Quality Metrics
**Description**: Learn how to define basic data quality metrics such as completeness, validity, and uniqueness for a simple dataset.

**Steps**:
1. Dataset: Use a CSV with columns such as `Name`, `Email`, and `Age`.
2. Metric Definitions:
    - Completeness: Percentage of non-null values.
    - Validity: Percentage of non-null email fields containing `@`.
    - Uniqueness: Count distinct entries in the Email column.

In [14]:
import pandas as pd
from io import StringIO

# Inline sample records; some fields are blank and some emails lack an '@',
# so every metric below has something to report.
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Parse the CSV text through an in-memory text buffer.
data = StringIO(csv_data)
df = pd.read_csv(data)

# Echo the parsed table under its heading.
print("Data:", df, sep="\n")

# Completeness: per-column share of non-missing cells, as a percentage.
completeness = df.notna().mean().mul(100)

# Validity: share of the non-missing emails that contain a literal '@'.
# Missing emails are dropped first, so they do not count as invalid here.
valid_emails = df["Email"].dropna().str.contains("@", regex=False)
validity = valid_emails.mean() * 100

# Uniqueness: number of distinct non-missing email values.
uniqueness = len(df["Email"].dropna().unique())

# Report the three metrics.
print("\nData Quality Metrics:")
print(f"Completeness (% non-null values):\n{completeness}")
print(f"\nValidity (% emails containing '@'): {validity:.2f}%")
print(f"Uniqueness (distinct emails): {uniqueness}")


Data:
      Name              Email   Age
0    Alice  alice@example.com  30.0
1      Bob        bob@example  25.0
2  Charlie                NaN  35.0
3    David  david@example.com   NaN
4      Eve    eve@example.com  29.0
5    Frank   frankexample.com  40.0

Data Quality Metrics:
Completeness (% non-null values):
Name     100.000000
Email     83.333333
Age       83.333333
dtype: float64

Validity (% emails containing '@'): 80.00%
Uniqueness (distinct emails): 5


### Task 2: Calculating Data Quality Score
**Description**: Aggregate multiple metrics to calculate an overall data quality score.

**Steps**:
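1. Formula: Simple average of three percentages: completeness (averaged across all columns), validity, and uniqueness (re-expressed here as the percentage of distinct non-null emails, rather than the raw count used in Task 1).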

In [15]:
import pandas as pd
from io import StringIO

# Inline sample records; some fields are blank and some emails lack an '@'.
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Parse the CSV text through an in-memory text buffer.
data = StringIO(csv_data)
df = pd.read_csv(data)

# Completeness: per-column share of non-missing cells, as a percentage.
completeness = df.notna().mean().mul(100)

# Validity: share of the non-missing emails that contain a literal '@'.
valid_emails = df["Email"].dropna().str.contains("@", regex=False)
validity = valid_emails.mean() * 100

# Uniqueness: distinct emails as a percentage of the non-missing emails.
# Falls back to 0 when there are no emails at all, avoiding a divide-by-zero.
total_non_null_emails = df["Email"].notna().sum()
unique_emails = len(df["Email"].dropna().unique())
if total_non_null_emails > 0:
    uniqueness = unique_emails / total_non_null_emails * 100
else:
    uniqueness = 0

# Report the individual metrics.
print("Data Quality Metrics:")
print(f"Completeness (% non-null values):\n{completeness}")
print(f"Validity (% emails containing '@'): {validity:.2f}%")
print(f"Uniqueness (% unique emails): {uniqueness:.2f}%")

# Overall score: collapse completeness to one number (mean over columns),
# then take the unweighted mean of the three percentages.
average_completeness = completeness.mean()
data_quality_score = sum((average_completeness, validity, uniqueness)) / 3

print(f"\nOverall Data Quality Score: {data_quality_score:.2f}%")


Data Quality Metrics:
Completeness (% non-null values):
Name     100.000000
Email     83.333333
Age       83.333333
dtype: float64
Validity (% emails containing '@'): 80.00%
Uniqueness (% unique emails): 100.00%

Overall Data Quality Score: 89.63%


### Task 3: Creating Expectations for a CSV
**Description**: Develop basic data quality expectations using Great Expectations.

**Steps**:
1. Expectation Suite
2. Define Expectations for Completeness

In [16]:
import great_expectations as ge
import pandas as pd
from io import StringIO

# Sample CSV data as string; some fields are blank and some emails lack an '@'.
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Load data into pandas DataFrame
data = StringIO(csv_data)
df = pd.read_csv(data)

# Wrap the DataFrame in a Great Expectations dataset so that the
# `expect_*` methods become available on it.
ge_df = ge.from_pandas(df)

# Name under which the expectations are grouped and saved.
suite_name = "basic_completeness_suite"

# Name the suite carried by the dataset itself.
# NOTE(review): in the legacy dataset API, `create_expectation_suite` is a
# DataContext method and is not available on the object returned by
# `ge.from_pandas`, so the previous call would fail; the dataset exposes an
# `expectation_suite_name` attribute instead. Confirm against the installed
# Great Expectations version.
ge_df.expectation_suite_name = suite_name

# Completeness expectations: 'Name' and 'Email' must have no missing values.
# 'Age' is allowed to contain nulls, so no expectation is registered for it.
ge_df.expect_column_values_to_not_be_null("Name")
ge_df.expect_column_values_to_not_be_null("Email")

# Validate the dataset against the expectations registered above.
results = ge_df.validate()

# Print the results summary
print(results)

# Persist the expectation suite as a JSON file named after the suite.
ge_df.save_expectation_suite(suite_name + ".json")


ModuleNotFoundError: No module named 'numpy.char'

### Task 4: Running and Validating Expectations
**Description**: Run the created expectations and generate an output report.

**Steps**:
1. Validate
2. Generate HTML Report

In [None]:
import great_expectations as ge
import pandas as pd
from io import StringIO

# Sample CSV data; some fields are blank and some emails lack an '@'.
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Load data into pandas DataFrame
data = StringIO(csv_data)
df = pd.read_csv(data)

# Wrap the DataFrame in a Great Expectations dataset so that the
# `expect_*` methods become available on it.
ge_df = ge.from_pandas(df)

# Expectation Suite name
suite_name = "basic_completeness_suite"

# Name the suite carried by the dataset itself.
# NOTE(review): in the legacy dataset API, `create_expectation_suite` is a
# DataContext method and is not available on the object returned by
# `ge.from_pandas`, so the previous call would fail; the dataset exposes an
# `expectation_suite_name` attribute instead. Confirm against the installed
# Great Expectations version.
ge_df.expectation_suite_name = suite_name

# Define completeness expectations for 'Name' and 'Email'
ge_df.expect_column_values_to_not_be_null("Name")
ge_df.expect_column_values_to_not_be_null("Email")

# Run validation (this checks the data against the expectations above)
validation_result = ge_df.validate()

# Print summary of validation results
print(validation_result)

# Save the expectation suite to JSON file
suite_filepath = suite_name + ".json"
ge_df.save_expectation_suite(suite_filepath)

# Generate an HTML validation report
from great_expectations.render.renderer import ValidationResultsPageRenderer
from great_expectations.render.view import DefaultJinjaPageView
# Imported by the original cell; not used by the rendering below.
from great_expectations.data_context import DataContext

# No DataContext is instantiated here: the renderer and view below operate on
# the validation result directly, and the previous `DataContext()` call was
# unused.
# NOTE(review): a bare `DataContext()` looks for a Great Expectations project
# on disk rather than building an in-memory config — confirm for the
# installed version before reintroducing it.

# Turn the validation result into a renderable document, then into HTML.
renderer = ValidationResultsPageRenderer()
view = DefaultJinjaPageView()
rendered_html = renderer.render(validation_result)
html_output = view.render(rendered_html)

# Save the report; an explicit encoding keeps the output independent of the
# platform's default locale encoding.
with open("validation_report.html", "w", encoding="utf-8") as f:
    f.write(html_output)

print("HTML validation report generated: validation_report.html")


ModuleNotFoundError: No module named 'great_expectations'

### Task 5: Automating Data Quality Score Calculation
**Description**: Automate the data quality score via a script that integrates with Great
Expectations.

In [None]:
import great_expectations as ge
import pandas as pd
from io import StringIO

# Sample CSV data (replace or load your CSV file as needed)
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Load data into pandas DataFrame
data = StringIO(csv_data)
df = pd.read_csv(data)

# Wrap the DataFrame in a Great Expectations dataset so that the
# `expect_*` methods become available on it.
ge_df = ge.from_pandas(df)

def automate_dqs(dataframe):
    """Validate *dataframe* and return its Data Quality Score.

    Registers not-null expectations on the "Name", "Email" and "Age"
    columns, validates the dataset, prints the score and returns it.

    Args:
        dataframe: A Great Expectations dataset such as the object returned
            by ``ge.from_pandas``. Expectations are registered on it in
            place.

    Returns:
        The fraction (0.0 to 1.0) of expectations that passed, or 0.0 when
        the validation result contains no expectations.
    """
    suite_name = "auto_dqs_suite"

    # Name the suite carried by the dataset itself.
    # NOTE(review): in the legacy dataset API, `create_expectation_suite` is
    # a DataContext method and is not available on the dataset, so the
    # previous call would fail; confirm against the installed version.
    dataframe.expectation_suite_name = suite_name

    # One completeness expectation per column.
    dataframe.expect_column_values_to_not_be_null("Name")
    dataframe.expect_column_values_to_not_be_null("Email")
    dataframe.expect_column_values_to_not_be_null("Age")

    validation_result = dataframe.validate()

    # Each entry in "results" corresponds to one expectation and carries a
    # boolean "success" flag.
    results = validation_result["results"]
    total_expectations = len(results)
    passed_expectations = sum(1 for r in results if r["success"])

    # Score is the pass ratio; guard against an empty result list.
    dqs = passed_expectations / total_expectations if total_expectations > 0 else 0.0

    print(f"Data Quality Score (DQS): {dqs:.2f}")

    return dqs

# Run the automation function
automate_dqs(ge_df)


ModuleNotFoundError: No module named 'great_expectations'

### Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
**Description**: Implement a system where if data quality metrics fall below a threshold,
automated data cleaning scripts are triggered.

**Steps**:
1. Define Cleaning Logic
2. Integrate with Great Expectations:
    - Use an action within the Great Expectations action list that only triggers if quality score is below a threshold, automating the cleaning.

In [None]:
import great_expectations as ge
import pandas as pd
from io import StringIO

# Sample CSV data; some fields are blank and some emails lack an '@'.
csv_data = """Name,Email,Age
Alice,alice@example.com,30
Bob,bob@example,25
Charlie,,35
David,david@example.com,
Eve,eve@example.com,29
Frank,frankexample.com,40
"""

# Load data into pandas DataFrame
data = StringIO(csv_data)
df = pd.read_csv(data)

# Wrap the DataFrame in a Great Expectations dataset so that the
# `expect_*` methods become available on it.
ge_df = ge.from_pandas(df)

def automate_dqs(dataframe):
    """Validate *dataframe* and return its Data Quality Score with the dataset.

    Registers not-null expectations on the "Name", "Email" and "Age"
    columns and validates the dataset against them.

    Args:
        dataframe: A Great Expectations dataset such as the object returned
            by ``ge.from_pandas``. Expectations are registered on it in
            place.

    Returns:
        A ``(dqs, dataframe)`` tuple: the fraction (0.0 to 1.0) of
        expectations that passed (0.0 when there are none) and the same
        dataset object that was passed in.
    """
    # Name the suite carried by the dataset itself.
    # NOTE(review): in the legacy dataset API, `create_expectation_suite` is
    # a DataContext method and is not available on the dataset, so the
    # previous call would fail; confirm against the installed version.
    dataframe.expectation_suite_name = "auto_clean_suite"

    # One completeness expectation per column.
    dataframe.expect_column_values_to_not_be_null("Name")
    dataframe.expect_column_values_to_not_be_null("Email")
    dataframe.expect_column_values_to_not_be_null("Age")

    validation_result = dataframe.validate()

    # Each entry in "results" corresponds to one expectation and carries a
    # boolean "success" flag.
    results = validation_result["results"]
    total = len(results)
    passed = sum(1 for r in results if r["success"])

    # Score is the pass ratio; guard against an empty result list.
    dqs = passed / total if total > 0 else 0.0
    return dqs, dataframe

def clean_data(dataframe):
    """Return a cleaned copy of *dataframe*, leaving the input unmodified.

    Cleaning steps, applied in order:

    1. Missing "Age" values are replaced with the column median.
    2. Missing "Email" values are replaced with ``"unknown@example.com"``.
    3. Rows whose "Email" does not contain a literal ``@`` are removed.
    4. Rows with a missing "Name" are removed.

    Progress messages are printed before and after cleaning.

    Args:
        dataframe: A pandas DataFrame (or subclass) with "Name", "Email"
            and "Age" columns.

    Returns:
        A new frame holding the cleaned rows; the original row index labels
        of the surviving rows are kept.
    """
    print("Starting automated cleaning...")

    # Work on a copy: assigning columns directly on the argument would
    # silently rewrite the caller's data as a side effect.
    cleaned = dataframe.copy()

    # 1. Fill missing Age with the median age.
    cleaned["Age"] = cleaned["Age"].fillna(cleaned["Age"].median())

    # 2. Fill missing Email with a placeholder address.
    cleaned["Email"] = cleaned["Email"].fillna("unknown@example.com")

    # 3. Keep only rows whose email contains '@'. The match is a plain
    #    substring test (regex=False), and any value that cannot be tested
    #    is treated as invalid (na=False) instead of breaking the row filter.
    has_at_sign = cleaned["Email"].str.contains("@", regex=False, na=False)
    cleaned = cleaned[has_at_sign]

    # 4. Drop rows with a missing Name.
    cleaned = cleaned.dropna(subset=["Name"])

    print("Cleaning completed.")
    return cleaned

def main():
    """Score the module-level dataset and clean it if the score is too low.

    Computes the Data Quality Score of ``ge_df`` via ``automate_dqs`` and
    prints it. When the score is below the 0.8 threshold, the dataset is
    passed through ``clean_data``, re-wrapped as a Great Expectations
    dataset, scored again, and the new score is printed. Otherwise a
    message states that no cleaning is needed.
    """
    threshold = 0.8
    score, scored_dataset = automate_dqs(ge_df)
    print(f"Data Quality Score: {score:.2f}")

    needs_cleaning = score < threshold
    if not needs_cleaning:
        print("Data quality is above threshold, no cleaning needed.")
        return

    print(f"DQS below threshold ({threshold}), triggering cleaning...")
    repaired = clean_data(scored_dataset)

    # Wrap the cleaned frame again so it can be validated, then re-score it.
    rescored, _ = automate_dqs(ge.from_pandas(repaired))
    print(f"Post-cleaning Data Quality Score: {rescored:.2f}")


if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'great_expectations'