## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [1]:
# Write a conceptual framework described in Python pseudo-code:
# Conceptual Data Quality Measurement Framework (Python Pseudo-code)

# Assume we have a dataset represented as a Pandas DataFrame or a list of dictionaries

FRAMEWORK_NAME = "Simple ISO 8000 Data Quality Framework"
ISO_8000_PRINCIPLES = [
    "Completeness",
    "Accuracy",
    "Consistency",
    "Validity",
    "Timeliness",
    "Uniqueness"
    # Add other relevant ISO 8000 principles or data quality dimensions
]

def load_data(data_source):
    """
    Loads data from a specified source.
    Args:
        data_source: Path to a file, database connection, or data structure.
    Returns:
        dataset: The loaded dataset (e.g., Pandas DataFrame, list of dictionaries).
    """
    # Implementation details for loading data (e.g., pd.read_csv, database query)
    pass

def measure_completeness(dataset, column_name):
    """
    Measures the completeness of a specific column in the dataset.
    Returns:
        completeness_score (float): Percentage of non-missing values (0-100).
    """
    # Implementation using Pandas or other data manipulation libraries
    pass

def measure_accuracy(dataset, column_name, expected_values=None, validation_rules=None):
    """
    Measures the accuracy of a specific column against expected values or rules.
    Returns:
        accuracy_score (float): Percentage of accurate values (0-100).
    """
    # Implementation involving comparisons or rule-based validation
    pass

def measure_consistency(dataset, column_name1, column_name2, consistency_rule):
    """
    Measures the consistency between two or more columns based on a defined rule.
    Returns:
        consistency_score (float): Percentage of consistent values (0-100).
    """
    # Implementation applying the consistency rule and calculating the score
    pass

def measure_validity(dataset, column_name, data_type, format_rules=None):
    """
    Measures the validity of a specific column based on its data type and format.
    Returns:
        validity_score (float): Percentage of valid values (0-100).
    """
    # Implementation checking data types and applying format rules
    pass

def measure_timeliness(dataset, timestamp_column, reference_time=None, acceptable_delay=None):
    """
    Measures the timeliness of the data based on timestamps.
    Returns:
        timeliness_score (float): Score based on how recent the data is (0-100).
    """
    # Implementation involving date/time comparisons
    pass

def measure_uniqueness(dataset, column_name):
    """
    Measures the uniqueness of values in a specific column.
    Returns:
        uniqueness_score (float): Percentage of unique values (0-100).
    """
    # Implementation to identify and count duplicate values
    pass

def assess_data_quality(dataset):
    """
    Orchestrates the data quality assessment process.
    Args:
        dataset: The input dataset.
    Returns:
        quality_report (dict): A dictionary containing the data quality scores for each dimension.
    """
    quality_report = {}

    # Define which columns and rules to apply for each dimension
    completeness_config = {"column": "product_name"}
    accuracy_config = {"column": "price", "validation": lambda x: x > 0}
    consistency_config = {"column1": "order_date", "column2": "ship_date", "rule": lambda order, ship: ship >= order}
    validity_config = {"column": "email", "type": "string", "format": "email"}
    timeliness_config = {"timestamp_column": "last_updated", "reference": "now", "delay": "30 days"}
    uniqueness_config = {"column": "customer_id"}

    # Measure each data quality dimension
    if "Completeness" in ISO_8000_PRINCIPLES:
        quality_report["Completeness"] = measure_completeness(dataset, completeness_config["column"])

    if "Accuracy" in ISO_8000_PRINCIPLES:
        quality_report["Accuracy"] = measure_accuracy(dataset, accuracy_config["column"], validation_rules=accuracy_config["validation"])

    if "Consistency" in ISO_8000_PRINCIPLES:
        quality_report["Consistency"] = measure_consistency(dataset, consistency_config["column1"], consistency_config["column2"], consistency_config["rule"])

    if "Validity" in ISO_8000_PRINCIPLES:
        quality_report["Validity"] = measure_validity(dataset, validity_config["column"], validity_config["type"], format_rules=validity_config["format"])

    if "Timeliness" in ISO_8000_PRINCIPLES:
        quality_report["Timeliness"] = measure_timeliness(dataset, timeliness_config["timestamp_column"], reference_time=timeliness_config["reference"], acceptable_delay=timeliness_config["delay"])

    if "Uniqueness" in ISO_8000_PRINCIPLES:
        quality_report["Uniqueness"] = measure_uniqueness(dataset, uniqueness_config["column"])

    return quality_report

def generate_report(quality_report):
    """
    Generates a human-readable report of the data quality assessment.
    Args:
        quality_report (dict): The dictionary of data quality scores.
    """
    print(f"--- {FRAMEWORK_NAME} Report ---")
    for dimension, score in quality_report.items():
        print(f"{dimension}: {score:.2f}%")
    # Optionally, include thresholds and flags for each dimension

# --- Main Execution ---
if __name__ == "__main__":
    # Simulate loading data
    data = [
        {"product_name": "Laptop", "price": 1200, "order_date": "2025-05-15", "ship_date": "2025-05-18", "email": "user1@example.com", "last_updated": "2025-05-17", "customer_id": 1},
        {"product_name": "Mouse", "price": 25, "order_date": "2025-05-10", "ship_date": "2025-05-12", "email": "user2@example.com", "last_updated": "2025-04-20", "customer_id": 2},
        {"product_name": None, "price": -5, "order_date": "2025-05-16", "ship_date": "2025-05-15", "email": "invalid-email", "last_updated": "2025-05-18", "customer_id": 1},
        {"product_name": "Keyboard", "price": 75, "order_date": "2025-05-01", "ship_date": "2025-05-01", "email": "user4@example.co.uk", "last_updated": "2025-05-18", "customer_id": 3},
        {"product_name": "Monitor", "price": 300, "order_date": "2025-05-18", "ship_date": "2025-05-19", "email": "user5@example.com", "last_updated": "2025-05-18", "customer_id": 4},
        {"product_name": "Mouse", "price": 30, "order_date": "2025-05-10", "ship_date": "2025-05-11", "email": "user2@example.com", "last_updated": "2025-05-16", "customer_id": 2},
    ]
    dataset = data # In a real scenario, you would use load_data()

    # Assess data quality
    quality_report = assess_data_quality(dataset)

    # Generate a report
    generate_report(quality_report)

--- Simple ISO 8000 Data Quality Framework Report ---


TypeError: unsupported format string passed to NoneType.__format__