In [1]:
# Data Quality Metrics & Scoring Examples

import pandas as pd
import numpy as np

# ---- Task 1: Customer Dataset Scoring ----
# Example customer dataset
# Five sample customers; email, phone_number and address each contain
# exactly one missing (None) entry so the completeness metric has gaps
# to detect.
customer_data = {
    'customer_id': [101, 102, 103, 104, 105],
    'email': [
        'alice@example.com',
        None,
        'bob@domain.com',
        'carol@domain.com',
        'dave@example.com',
    ],
    'phone_number': [
        '123-456-7890',
        '234-567-8901',
        '345-678-9012',
        None,
        '567-890-1234',
    ],
    'address': [
        'New York',
        'Los Angeles',
        None,
        'Chicago',
        'Houston',
    ],
}
customer_df = pd.DataFrame(data=customer_data)

# Define Data Quality Metrics
def calculate_quality_scores(df):
    """Score a DataFrame on completeness, uniqueness and dtype consistency.

    Args:
        df: pandas DataFrame to evaluate. It is only read, never modified.

    Returns:
        A tuple ``(completeness, uniqueness, consistency, overall_score)``:

        * completeness -- mean over columns of the fraction of non-null cells.
        * uniqueness -- mean number of distinct non-null values per column,
          divided by the number of rows.
        * consistency -- fraction of columns whose dtype equals the dtype of
          the first column (an example check only).
        * overall_score -- unweighted mean of the three scores above.

        If ``df`` has no rows or no columns, every score is NaN because
        there is nothing to measure.
    """
    row_count, column_count = df.shape
    if row_count == 0 or column_count == 0:
        # Avoid a 0/0 division (no rows) or indexing a missing first
        # column (no columns); report the scores as undefined instead.
        undefined = float("nan")
        return undefined, undefined, undefined, undefined

    # Completeness Score (fraction of non-null values)
    completeness = df.notnull().mean().mean()

    # Uniqueness Score (distinct non-null values per column, relative to rows)
    uniqueness = df.nunique().mean() / row_count

    # Consistency Score (example check: do all columns share one dtype?)
    # df.dtypes is indexed by column label, so the first entry must be taken
    # positionally with .iloc; plain [0] is a label lookup that only works
    # through a deprecated positional fallback and fails for integer column
    # labels that do not include 0.
    first_dtype = df.dtypes.iloc[0]
    consistency = (df.dtypes == first_dtype).mean()

    # Calculate overall score (simple average)
    overall_score = (completeness + uniqueness + consistency) / 3
    return completeness, uniqueness, consistency, overall_score

# Task 1: Customer dataset quality scoring
# Score the customer dataset, then print the header followed by each metric
# on its own line, rounded to two decimals.
(
    completeness,
    uniqueness,
    consistency,
    overall_score,
) = calculate_quality_scores(customer_df)

print(
    "\nTask 1: Customer Data Quality Metrics",
    f"Completeness Score: {completeness:.2f}",
    f"Uniqueness Score: {uniqueness:.2f}",
    f"Consistency Score: {consistency:.2f}",
    f"Overall Data Quality Score: {overall_score:.2f}",
    sep="\n",
)

# ---- Task 2: Online Shop Dataset Scoring ----
# Example online shop dataset
# Five sample products; the last row has neither a price nor a
# last_updated value.
shop_data = {
    'product_id': [1001, 1002, 1003, 1004, 1005],
    'product_name': [
        'Laptop',
        'Smartphone',
        'Tablet',
        'Headphones',
        'Smartwatch',
    ],
    # Missing price for one product
    'price': [1000, 500, 300, 150, None],
    'stock': [50, 30, 10, 100, 80],
    'last_updated': [
        '2022-03-01',
        '2022-02-15',
        '2022-03-05',
        '2022-01-30',
        None,
    ],
}
shop_df = pd.DataFrame(data=shop_data)

# Define metrics for Task 2
def evaluate_shop_data(df):
    """Score a product DataFrame on accuracy, timeliness and integrity.

    Args:
        df: DataFrame with 'price', 'last_updated' and 'product_id' columns.

    Returns:
        A tuple ``(accuracy, timeliness, integrity, overall_score)``:

        * accuracy -- fraction of rows whose price is greater than zero
          (a missing price does not satisfy the comparison).
        * timeliness -- fraction of rows with a non-null last_updated.
        * integrity -- boolean, true when no product_id value is repeated.
        * overall_score -- unweighted mean of the three, with integrity
          counted as 1 or 0.
    """
    # Integrity: no product ID may appear more than once.
    repeated_id_count = df['product_id'].duplicated().sum()
    integrity = repeated_id_count == 0

    # Timeliness: share of rows that carry an update stamp at all.
    has_update_stamp = df['last_updated'].notnull()
    timeliness = has_update_stamp.mean()

    # Accuracy: share of rows passing the simple rule "price > 0".
    price_is_positive = df['price'] > 0
    accuracy = price_is_positive.mean()

    # Simple average of the three metrics.
    overall_score = (accuracy + timeliness + integrity) / 3
    return accuracy, timeliness, integrity, overall_score

# Task 2: Online shop dataset quality scoring
# Score the online shop dataset, then print the header followed by each
# metric on its own line, rounded to two decimals.
(
    accuracy,
    timeliness,
    integrity,
    overall_score_shop,
) = evaluate_shop_data(shop_df)

print(
    "\nTask 2: Online Shop Data Quality Metrics",
    f"Accuracy Score: {accuracy:.2f}",
    f"Timeliness Score: {timeliness:.2f}",
    f"Integrity Score: {integrity:.2f}",
    f"Overall Data Quality Score: {overall_score_shop:.2f}",
    sep="\n",
)

# ---- Task 3: Financial Dataset Scoring ----
# Example financial dataset
# Five sample transactions; one amount and one date are missing (None).
financial_data = {
    'transaction_id': [10001, 10002, 10003, 10004, 10005],
    'amount': [2000, 1500, None, 1800, 2100],
    'date': [
        '2022-04-01',
        '2022-05-01',
        '2022-06-01',
        None,
        '2022-07-01',
    ],
    'account_id': [501, 502, 503, 504, 505],
}
financial_df = pd.DataFrame(data=financial_data)

# Define metrics for Task 3
def evaluate_financial_data(df):
    """Score a transactions DataFrame on validity, precision and accessibility.

    Args:
        df: DataFrame with 'amount', 'date' and 'account_id' columns.
            It is only read; the 'date' column is not converted in place.

    Returns:
        A tuple ``(validity, precision, accessibility, overall_score)``:

        * validity -- fraction of rows with a non-null amount.
        * precision -- fraction of the non-null dates that pandas can parse
          as datetimes; 1.0 when there are no non-null dates to check.
          Missing dates are not counted as malformed.
        * accessibility -- fraction of rows with a non-null account_id.
        * overall_score -- unweighted mean of the three scores above.
    """
    # Validity (fraction of non-null amounts)
    validity = df['amount'].notnull().mean()

    # Precision (fraction of present dates that parse correctly).
    # errors='coerce' turns unparseable entries into NaT instead of raising,
    # so no exception handling is needed, and the parsed result stays in a
    # local variable so the caller's DataFrame is left untouched.
    raw_dates = df['date']
    is_present = raw_dates.notnull()
    parsed_dates = pd.to_datetime(raw_dates, errors='coerce')
    present_count = is_present.sum()
    if present_count:
        parsed_ok = parsed_dates.notnull() & is_present
        precision = parsed_ok.sum() / present_count
    else:
        # No dates to check, so none of them is malformed.
        precision = 1.0

    # Accessibility (fraction of non-null account_ids)
    accessibility = df['account_id'].notnull().mean()

    # Calculate overall score (simple average)
    overall_score = (validity + precision + accessibility) / 3
    return validity, precision, accessibility, overall_score

# Task 3: Financial dataset quality scoring
# Score the financial dataset, then print the header followed by each metric
# on its own line, rounded to two decimals.
(
    validity,
    precision,
    accessibility,
    overall_score_financial,
) = evaluate_financial_data(financial_df)

print(
    "\nTask 3: Financial Data Quality Metrics",
    f"Validity Score: {validity:.2f}",
    f"Precision Score: {precision:.2f}",
    f"Accessibility Score: {accessibility:.2f}",
    f"Overall Data Quality Score: {overall_score_financial:.2f}",
    sep="\n",
)



Task 1: Customer Data Quality Metrics
Completeness Score: 0.85
Uniqueness Score: 0.85
Consistency Score: 0.25
Overall Data Quality Score: 0.65

Task 2: Online Shop Data Quality Metrics
Accuracy Score: 0.80
Timeliness Score: 0.80
Integrity Score: 1.00
Overall Data Quality Score: 0.87

Task 3: Financial Data Quality Metrics
Validity Score: 0.80
Precision Score: 1.00
Accessibility Score: 1.00
Overall Data Quality Score: 0.93


  consistency = (df.dtypes == df.dtypes[0]).mean()  # Example check: whether all columns have the same type
