# In [2]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv.
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for customer_data.csv.

import pandas as pd

# -----------------------------------------------
# Part 1: Load Dataset & Check Missing Values
# -----------------------------------------------

# Sample customer data, used whenever no CSV path is supplied.
# It deliberately contains a missing email, missing phones, a repeated email
# and mixed-case gender labels so every check below has something to find.
data = {
    'CustomerID': [101, 102, 103, 104, 105, 106],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Email': ['alice@example.com', 'bob@example.com', None, 'alice@example.com', 'eva@example.com', 'frank@example.com'],
    'Phone': ['123-456-7890', None, '555-123-4567', '123-456-7890', '999-888-7777', None],
    'Gender': ['Female', 'Male', 'male', 'MALE', 'Female', 'female']
}


def load_customer_data(csv_path=None):
    """Return the customer records as a DataFrame.

    Args:
        csv_path: Optional path to a CSV file (e.g. ``customer_data.csv``).
            When omitted, the in-memory sample ``data`` is used instead.

    Returns:
        A ``pandas.DataFrame`` holding the customer records.
    """
    if csv_path is None:
        return pd.DataFrame(data)
    return pd.read_csv(csv_path)


def find_duplicate_emails(frame):
    """Return every row of ``frame`` whose 'Email' value occurs more than once.

    Rows with a missing email are excluded before the duplicate check:
    ``DataFrame.duplicated`` treats missing values as equal to one another, so
    without this filter two customers who simply have no email on file would
    be reported as duplicates of each other. Missing emails are already
    counted separately by the missing-value check.

    Args:
        frame: DataFrame with an 'Email' column.

    Returns:
        The subset of ``frame`` (original index preserved) containing all
        occurrences of each repeated, non-missing email.
    """
    with_email = frame[frame['Email'].notna()]
    return with_email[with_email.duplicated(subset='Email', keep=False)]


df = load_customer_data()

# Check for missing values in 'Email' and 'Phone'
missing_email = df['Email'].isnull().sum()
missing_phone = df['Phone'].isnull().sum()

print("=== Missing Values ===")
print(f"Missing 'Email': {missing_email}")
print(f"Missing 'Phone': {missing_phone}")

# --------------------------------------------------
# Part 2: Identify Duplicates & Inconsistencies
# --------------------------------------------------

# Identify duplicate email addresses (missing emails are not duplicates)
duplicate_emails = find_duplicate_emails(df)

print("\n=== Duplicate Email Records ===")
print(duplicate_emails[['CustomerID', 'Name', 'Email']])

# --------------------------------------------------
# Part 3: Generate a Data Quality Report
# --------------------------------------------------

# Total records
total_records = len(df)

# Number of duplicate rows (a row counts only if it matches an earlier row
# in every column)
total_duplicates = df.duplicated().sum()

# Missing values for each column; computed before the helper column below is
# added so the report lists only the original columns
missing_values = df.isnull().sum()

# Check inconsistencies in 'Gender' column: trim whitespace and lower-case so
# spellings such as 'Male', 'male' and 'MALE' collapse to a single label
df['Gender_cleaned'] = df['Gender'].str.strip().str.lower()
gender_variants = df['Gender_cleaned'].unique()

# Summary Report
print("\n=== Data Quality Report ===")
print(f"Total Records: {total_records}")
print(f"Total Duplicate Rows: {total_duplicates}")
print("\nMissing Values by Column:")
print(missing_values)

print("\nGender Column Inconsistencies (Normalized):")
print(gender_variants)






# Output:
#
# === Missing Values ===
# Missing 'Email': 1
# Missing 'Phone': 2
#
# === Duplicate Email Records ===
#    CustomerID   Name              Email
# 0         101  Alice  alice@example.com
# 3         104  David  alice@example.com
#
# === Data Quality Report ===
# Total Records: 6
# Total Duplicate Rows: 0
#
# Missing Values by Column:
# CustomerID    0
# Name          0
# Email         1
# Phone         2
# Gender        0
# dtype: int64
#
# Gender Column Inconsistencies (Normalized):
# ['female' 'male']
