In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.


import pandas as pd

# =======================
# Task 1: Assign Scores to Customer Dataset
# =======================

# Function to calculate completeness, uniqueness, and consistency
def data_quality_task1():
    # Example customer dataset
    customer_data = {'customer_id': [101, 102, 103, 104, 105],
                     'name': ['Alice', 'Bob', 'Charlie', 'David', None],
                     'email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com', ''],
                     'age': [25, 30, None, 40, 35]}
    
    df_customer = pd.DataFrame(customer_data)
    
    # Calculate completeness (percentage of non-null values)
    completeness = df_customer.notnull().mean().mean()
    
    # Calculate uniqueness (percentage of unique values)
    uniqueness = df_customer.nunique().mean() / len(df_customer)
    
    # Calculate consistency (all data should be in a consistent format)
    # In this case, we'll assume consistency is based on whether the 'email' and 'age' fields are non-null
    consistency = df_customer[['email', 'age']].notnull().mean().mean()

    # Calculate overall data quality score (average of the three metrics)
    data_quality_score = (completeness + uniqueness + consistency) / 3
    
    print("Task 1: Customer Dataset Data Quality Score:")
    print(f"Completeness: {completeness:.2f}")
    print(f"Uniqueness: {uniqueness:.2f}")
    print(f"Consistency: {consistency:.2f}")
    print(f"Overall Data Quality Score: {data_quality_score:.2f}")
    
    return data_quality_score

# =======================
# Task 2: Evaluate Online Shop Dataset
# =======================

# Function to evaluate accuracy, timeliness, and integrity
def data_quality_task2():
    # Example dataset for an online shop
    online_shop_data = {'product_id': [1, 2, 3, 4],
                        'product_name': ['T-shirt', 'Jeans', 'Sneakers', 'Hat'],
                        'price': [19.99, 39.99, 49.99, None],
                        'last_updated': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-02-01']}
    
    df_shop = pd.DataFrame(online_shop_data)
    
    # Calculate accuracy (percentage of correct price values, excluding None)
    accuracy = df_shop['price'].notnull().mean()
    
    # Calculate timeliness (percentage of records with a valid last_updated date within last 30 days)
    timeliness = (pd.to_datetime(df_shop['last_updated']) > pd.to_datetime('today') - pd.Timedelta(days=30)).mean()
    
    # Calculate integrity (percentage of non-null product names)
    integrity = df_shop['product_name'].notnull().mean()

    # Calculate overall data quality score
    data_quality_score = (accuracy + timeliness + integrity) / 3
    
    print("\nTask 2: Online Shop Dataset Data Quality Score:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Timeliness: {timeliness:.2f}")
    print(f"Integrity: {integrity:.2f}")
    print(f"Overall Data Quality Score: {data_quality_score:.2f}")
    
    return data_quality_score

# =======================
# Task 3: Evaluate Financial Dataset
# =======================

# Function to assess validity, precision, and accessibility
def data_quality_task3():
    # Example financial dataset
    financial_data = {'transaction_id': [1, 2, 3, 4],
                      'amount': [1000.50, 2500.75, None, 1500.00],
                      'valid': [True, True, False, True],
                      'transaction_date': ['2023-01-10', '2023-01-15', '2023-02-01', '2023-01-20']}
    
    df_financial = pd.DataFrame(financial_data)
    
    # Calculate validity (percentage of valid transactions)
    validity = df_financial['valid'].mean()
    
    # Calculate precision (percentage of transactions with a valid 'amount' field)
    precision = df_financial['amount'].notnull().mean()
    
    # Calculate accessibility (percentage of non-null transaction dates)
    accessibility = df_financial['transaction_date'].notnull().mean()

    # Calculate overall data quality score
    data_quality_score = (validity + precision + accessibility) / 3
    
    print("\nTask 3: Financial Dataset Data Quality Score:")
    print(f"Validity: {validity:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Accessibility: {accessibility:.2f}")
    print(f"Overall Data Quality Score: {data_quality_score:.2f}")
    
    return data_quality_score

# =======================
# Run All Tasks
# =======================

def run_all_data_quality_tasks():
    score_task1 = data_quality_task1()  # Task 1: Customer dataset
    score_task2 = data_quality_task2()  # Task 2: Online shop dataset
    score_task3 = data_quality_task3()  # Task 3: Financial dataset

    # Average the data quality scores of all tasks
    overall_score = (score_task1 + score_task2 + score_task3) / 3
    print(f"\nOverall Data Quality Score: {overall_score:.2f}")
    
run_all_data_quality_tasks()



Task 1: Customer Dataset Data Quality Score:
Completeness: 0.90
Uniqueness: 0.90
Consistency: 0.90
Overall Data Quality Score: 0.90

Task 2: Online Shop Dataset Data Quality Score:
Accuracy: 0.75
Timeliness: 0.00
Integrity: 1.00
Overall Data Quality Score: 0.58

Task 3: Financial Dataset Data Quality Score:
Validity: 0.75
Precision: 0.75
Accessibility: 1.00
Overall Data Quality Score: 0.83

Overall Data Quality Score: 0.77
