# Measuring Completeness

**Activity Overview**: Evaluate data completeness by checking missing data rates and handling partially available records.

## Title: Customer Profiles

**Task**: Calculate the missing data rate for customer profiles.

**Steps**:
1. List all required fields for a complete customer profile (e.g., name, address, email,
phone number).
2. Analyze the dataset to count how many profiles have missing fields.
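3. Calculate the percentage of missing data fields across all profiles, i.e. the number of missing required-field values divided by (number of profiles × number of required fields).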

In [2]:
# Write your code from here
import pandas as pd

def calculate_customer_profile_missing_rate(file_path, required_fields):
    """
    Calculate the overall missing-data rate for customer profiles.

    The rate is the number of null cells found in the required columns
    divided by the number of cells expected there
    (number of profiles x number of required fields), as a percentage.
    A cell counts as missing when pandas parsed it as null/NaN.

    Args:
        file_path (str): Path to the CSV file containing customer profiles.
        required_fields (list): Column names that are considered required
                                for a complete customer profile
                                (e.g., ['name', 'address', 'email', 'phone_number']).
                                Any iterable of column names is accepted.

    Returns:
        float: The overall percentage of missing data across all specified
               required fields and all customer profiles. Returns 0.0 when
               there is nothing to check (no rows or no required fields).
               Returns None, after printing an error message, if the file
               is not found, the file contains no data at all, or any of
               the required fields are missing from the dataset.
    """
    # Normalise to a list: indexing a DataFrame with a tuple would be
    # interpreted as a single (multi-level) key instead of several columns.
    required_fields = list(required_fields)

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row, so there are no columns to check.
        print(f"Error: No data found in '{file_path}'")
        return None

    missing_cols = [field for field in required_fields if field not in df.columns]
    if missing_cols:
        print(f"Error: Required fields not found in the dataset: {missing_cols}")
        return None

    total_expected_values = len(df) * len(required_fields)
    if total_expected_values == 0:
        # Avoid dividing by zero when the dataset or the field list is empty.
        return 0.0

    total_missing_values = int(df[required_fields].isnull().sum().sum())

    # Built-in float keeps the return type in line with the documented contract.
    return float(total_missing_values / total_expected_values * 100)

# Demo: report field-level and profile-level completeness for the sample file.
customer_file = 'customer_profiles.csv'
required_customer_info = ['name', 'address', 'email', 'phone_number']
missing_rate = calculate_customer_profile_missing_rate(
    customer_file,
    required_customer_info,
)

# None signals that the helper already reported a problem, so there is
# nothing further to summarise in that case.
if missing_rate is not None:
    print(f"Overall Missing Data Rate for Customer Profiles: {missing_rate:.2f}%")

    # Profile-level view: a profile is incomplete if at least one of its
    # required fields is null.
    df_customers = pd.read_csv(customer_file)
    incomplete_row_mask = df_customers[required_customer_info].isna().any(axis=1)
    profiles_with_missing = df_customers.loc[incomplete_row_mask]

    num_profiles_with_missing = profiles_with_missing.shape[0]
    total_profiles = df_customers.shape[0]

    # Guard against dividing by zero for a header-only file.
    if total_profiles > 0:
        percentage_profiles_with_missing = (num_profiles_with_missing / total_profiles) * 100
    else:
        percentage_profiles_with_missing = 0

    print(
        "Number of Customer Profiles with Missing Required Fields: "
        f"{num_profiles_with_missing} out of {total_profiles} "
        f"({percentage_profiles_with_missing:.2f}%)"
    )

Error: File not found at 'customer_profiles.csv'
