## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [2]:
import pandas as pd
import re
from datetime import datetime
from io import StringIO

csv_data = """CustomerID,Email,Country,LastOrderDate,Status
1,john.doe@example.com,US,2025-05-10,Active
2,jane.smith@example.co.uk,GB,2025-05-05,Inactive
3,alice.wonder@example.in,IN,2025-05-12,Active
4,bob.builder@example.ca,CA,2025-04-28,Active
5,charlie.brown@example.com.au,AU,2025-05-01,Inactive
6,david.jones@example.us,US,2025-05-15,Active
7,eve.adams@example.de,DE,2025-05-08,Inactive
8,frank.zappa@example.fr,FR,2025-05-11,Active
9,grace.hopper@example.jp,JP,2025-05-03,Inactive
10,heidi.klum@example.ch,CH,2025-05-14,Active
11,john.doe@example.com,US,,Active
12,peter.pan@no-email.net,,GB,Inactive
13,wendy.darling@example.in,IN,2025-05-13,
14,captain.hook@example.ca,CA,2025-05-02,Active
15,tinker.bell@example.com.au,AU,invalid-date,Inactive
"""

# Load the CSV data from the string into a Pandas DataFrame
df = pd.read_csv(StringIO(csv_data))

# --- Data Quality Metrics ---

# Accuracy: Email format validation
email_regex = r"[^@]+@[^@]+\.[^@]+"
df['is_valid_email'] = df['Email'].astype(str).apply(lambda x: bool(re.fullmatch(email_regex, x)))
accuracy_email = df['is_valid_email'].mean() * 100
print(f"Accuracy (Valid Email Format): {accuracy_email:.2f}%")

# Accuracy: LastOrderDate is a valid date (after attempting conversion)
df['LastOrderDate'] = pd.to_datetime(df['LastOrderDate'], errors='coerce')
accuracy_valid_date = (df['LastOrderDate'].notna().sum() / len(df)) * 100
print(f"Accuracy (Valid Last Order Date): {accuracy_valid_date:.2f}%")

# Completeness: Email completeness for all customers
completeness_email = (df['Email'].notna().sum() / len(df)) * 100
print(f"Completeness (Email): {completeness_email:.2f}%")

# Completeness: LastOrderDate completeness
completeness_last_order_date = (df['LastOrderDate'].notna().sum() / len(df)) * 100
print(f"Completeness (Last Order Date): {completeness_last_order_date:.2f}%")

# Consistency: Valid Country Codes (assuming a list of valid codes)
valid_countries = ['US', 'GB', 'IN', 'CA', 'AU', 'DE', 'FR', 'JP', 'CH']
df['is_valid_country'] = df['Country'].isin(valid_countries)
consistency_country = df['is_valid_country'].mean() * 100
print(f"Consistency (Valid Country Codes): {consistency_country:.2f}%")

# Consistency: Last Order Date not in future for active customers
now = datetime.now()
df_active = df[df['Status'] == 'Active'].copy() # Use .copy() to avoid SettingWithCopyWarning
if not df_active.empty:
    consistency_future_order = ((df_active['LastOrderDate'] <= now).sum() / len(df_active)) * 100
    print(f"Consistency (Future Order Date for Active Customers): {consistency_future_order:.2f}%")
else:
    print("No active customers found to check future order dates.")

# Note: Timeliness is harder to assess with static data.
# It usually involves tracking when data was created or updated.

# --- You would typically have more sophisticated timeliness checks
# --- that might involve comparing timestamps or tracking data availability.

Accuracy (Valid Email Format): 100.00%
Accuracy (Valid Last Order Date): 80.00%
Completeness (Email): 100.00%
Completeness (Last Order Date): 80.00%
Consistency (Valid Country Codes): 93.33%
Consistency (Future Order Date for Active Customers): 87.50%
