In [4]:
# Data Quality Metrics & Scoring Examples

import pandas as pd
import numpy as np

# --- Performance-Optimized and Modularized Functions ---

# Function to calculate completeness, uniqueness, consistency, and overall score
def calculate_quality_scores(df):
    """Score a DataFrame on completeness, uniqueness and dtype consistency.

    Args:
        df: pandas DataFrame to evaluate.

    Returns:
        A tuple ``(completeness, uniqueness, consistency, overall_score)``:

        * completeness -- fraction of all cells that are non-null.
        * uniqueness -- sum of per-column distinct (non-null) value counts
          divided by the total number of cells.
        * consistency -- fraction of columns whose dtype equals the dtype
          of the first column (a deliberately simplified check).
        * overall_score -- arithmetic mean of the three scores above.

        A frame with no rows or no columns has nothing to measure and
        scores ``0.0`` on every metric.
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        # Avoid a 0/0 division and a lookup into an empty dtypes Series.
        return 0.0, 0.0, 0.0, 0.0

    # Completeness Score: share of non-null cells across the whole frame
    completeness = df.notnull().sum().sum() / total_cells

    # Uniqueness Score: distinct values per column, averaged over all cells
    uniqueness = df.nunique().sum() / total_cells

    # Consistency Score: df.dtypes is indexed by column label, so the first
    # column must be fetched by position with .iloc. Plain [0] is either a
    # deprecated positional fallback or, when a column is literally named 0,
    # a lookup of the wrong column.
    column_dtypes = df.dtypes
    consistency = (column_dtypes == column_dtypes.iloc[0]).mean()

    # Overall Quality Score: average of completeness, uniqueness, and consistency
    overall_score = (completeness + uniqueness + consistency) / 3
    return completeness, uniqueness, consistency, overall_score


# Function to evaluate shop dataset quality (accuracy, timeliness, integrity)
def evaluate_shop_data(df):
    """Score a shop product table on accuracy, timeliness and integrity.

    Args:
        df: pandas DataFrame with 'price', 'last_updated' and 'product_id'
            columns.

    Returns:
        A tuple ``(accuracy, timeliness, integrity, overall_score)``:

        * accuracy -- fraction of rows whose price is strictly positive
          (a missing price does not compare as positive).
        * timeliness -- fraction of rows with a non-null 'last_updated'.
        * integrity -- boolean, true when 'product_id' has no duplicates.
        * overall_score -- mean of the three, with integrity counted as 0/1.
    """
    row_count = df.shape[0]

    # Accuracy: rows carrying a strictly positive price
    positive_prices = df['price'] > 0
    accuracy = positive_prices.sum() / row_count

    # Timeliness: rows that have any last_updated value at all
    has_timestamp = df['last_updated'].notnull()
    timeliness = has_timestamp.sum() / row_count

    # Integrity: holds only when no product_id repeats
    duplicate_ids = df['product_id'].duplicated().sum()
    integrity = (duplicate_ids == 0)

    # Overall Shop Quality Score: plain average of the three metrics
    overall_score = (accuracy + timeliness + integrity) / 3
    return accuracy, timeliness, integrity, overall_score


# Function to evaluate financial dataset quality (validity, precision, accessibility)
def evaluate_financial_data(df):
    """Score a financial transactions table on validity, precision, accessibility.

    The input frame is only read; it is never modified.

    Args:
        df: pandas DataFrame with 'amount', 'date' and 'account_id' columns.

    Returns:
        A tuple ``(validity, precision, accessibility, overall_score)``:

        * validity -- fraction of rows with a non-null 'amount'.
        * precision -- 1 if the whole 'date' column parses with
          ``pd.to_datetime``, otherwise 0 (all-or-nothing).
        * accessibility -- fraction of rows with a non-null 'account_id'.
        * overall_score -- arithmetic mean of the three scores.
    """
    row_count = df.shape[0]

    # Validity: Percentage of non-null 'amount' entries
    validity = df['amount'].notnull().sum() / row_count

    # Precision: try to parse every date. The parsed result is discarded on
    # purpose -- assigning it back into df would silently change the caller's
    # 'date' column as a side effect of a read-only quality check.
    try:
        pd.to_datetime(df['date'], errors='raise')
    except (ValueError, TypeError, OverflowError):
        precision = 0  # At least one date could not be parsed
    else:
        precision = 1  # Every date parsed

    # Accessibility: Percentage of non-null 'account_id' entries
    accessibility = df['account_id'].notnull().sum() / row_count

    # Overall Financial Quality Score: Average of validity, precision, and accessibility
    overall_score = (validity + precision + accessibility) / 3
    return validity, precision, accessibility, overall_score


# --- Data Examples for Task 1, Task 2, Task 3 ---

# Task 1: sample customer table; some contact fields are missing (None)
customer_data = {
    'customer_id': [101, 102, 103, 104, 105],
    'email': [
        'alice@example.com',
        None,
        'bob@domain.com',
        'carol@domain.com',
        'dave@example.com',
    ],
    'phone_number': [
        '123-456-7890',
        '234-567-8901',
        '345-678-9012',
        None,
        '567-890-1234',
    ],
    'address': ['New York', 'Los Angeles', None, 'Chicago', 'Houston'],
}
customer_df = pd.DataFrame(data=customer_data)

# Task 1: score the customer table, then report every metric to two decimals
completeness, uniqueness, consistency, overall_score = calculate_quality_scores(
    customer_df
)
print(
    "\nTask 1: Customer Data Quality Metrics",
    f"Completeness Score: {completeness:.2f}",
    f"Uniqueness Score: {uniqueness:.2f}",
    f"Consistency Score: {consistency:.2f}",
    f"Overall Data Quality Score: {overall_score:.2f}",
    sep="\n",
)


# Task 2: sample online-shop catalogue; the last product lacks a price
# and a last_updated value
shop_data = {
    'product_id': [1001, 1002, 1003, 1004, 1005],
    'product_name': [
        'Laptop',
        'Smartphone',
        'Tablet',
        'Headphones',
        'Smartwatch',
    ],
    'price': [1000, 500, 300, 150, None],
    'stock': [50, 30, 10, 100, 80],
    'last_updated': [
        '2022-03-01',
        '2022-02-15',
        '2022-03-05',
        '2022-01-30',
        None,
    ],
}
shop_df = pd.DataFrame(data=shop_data)

# Task 2: score the catalogue, then report every metric to two decimals
accuracy, timeliness, integrity, overall_score_shop = evaluate_shop_data(
    shop_df
)
print(
    "\nTask 2: Online Shop Data Quality Metrics",
    f"Accuracy Score: {accuracy:.2f}",
    f"Timeliness Score: {timeliness:.2f}",
    f"Integrity Score: {integrity:.2f}",
    f"Overall Data Quality Score: {overall_score_shop:.2f}",
    sep="\n",
)


# Task 3: sample transaction ledger; one amount and one date are missing
financial_data = {
    'transaction_id': [10001, 10002, 10003, 10004, 10005],
    'amount': [2000, 1500, None, 1800, 2100],
    'date': [
        '2022-04-01',
        '2022-05-01',
        '2022-06-01',
        None,
        '2022-07-01',
    ],
    'account_id': [501, 502, 503, 504, 505],
}
financial_df = pd.DataFrame(data=financial_data)

# Task 3: score the ledger, then report every metric to two decimals
validity, precision, accessibility, overall_score_financial = (
    evaluate_financial_data(financial_df)
)
print(
    "\nTask 3: Financial Data Quality Metrics",
    f"Validity Score: {validity:.2f}",
    f"Precision Score: {precision:.2f}",
    f"Accessibility Score: {accessibility:.2f}",
    f"Overall Data Quality Score: {overall_score_financial:.2f}",
    sep="\n",
)




Task 1: Customer Data Quality Metrics
Completeness Score: 0.85
Uniqueness Score: 0.85
Consistency Score: 0.25
Overall Data Quality Score: 0.65

Task 2: Online Shop Data Quality Metrics
Accuracy Score: 0.80
Timeliness Score: 0.80
Integrity Score: 1.00
Overall Data Quality Score: 0.87

Task 3: Financial Data Quality Metrics
Validity Score: 0.80
Precision Score: 1.00
Accessibility Score: 1.00
Overall Data Quality Score: 0.93


  consistency = (df.dtypes == df.dtypes[0]).mean()
