In [1]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv .
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for customer_data.csv .

import pandas as pd

# Part 1: Load Customer Dataset and Check Missing Values
# ----------------------------------------------------
print("="*50)
print("PART 1: LOADING CUSTOMER DATA & MISSING VALUES CHECK")
print("="*50)

try:
    # Load customer data (replace with your actual file path)
    customer_df = pd.read_csv('customer_data.csv')
    
    # Check for missing values in key columns
    missing_values = customer_df[['Email', 'Phone']].isnull().sum()
    missing_pct = (customer_df[['Email', 'Phone']].isnull().mean() * 100).round(1)
    
    print("\nMissing Values in Key Columns:")
    print(pd.DataFrame({
        'Missing Count': missing_values,
        'Percentage (%)': missing_pct
    }))
    
except FileNotFoundError:
    print("\nError: customer_data.csv not found. Using sample data instead.")
    # Create sample data if file not found
    data = {
        'CustomerID': [101, 102, 103, 104, 105, 106, 107],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
        'Email': ['alice@example.com', 'bob@example.com', None, 'david@example.com', 
                 'eve@example.com', 'frank@example.com', None],
        'Phone': ['1234567890', None, '2345678901', '3456789012', 
                 None, '4567890123', '5678901234'],
        'JoinDate': ['2023-01-01', '2023-02-15', '2023-03-10', 
                    '2023-04-05', None, '2023-06-01', '2023-07-15']
    }
    customer_df = pd.DataFrame(data)
    print("\nSample Customer Data Created:")
    print(customer_df.head())

# Part 2: Identify Duplicate Emails
# --------------------------------
print("\n" + "="*50)
print("PART 2: DUPLICATE EMAIL CHECK")
print("="*50)

# Find duplicate emails (excluding null values)
email_duplicates = customer_df[customer_df.duplicated(subset=['Email'], keep=False) & 
                              customer_df['Email'].notna()]

if not email_duplicates.empty:
    print("\nDuplicate Email Records Found:")
    print(email_duplicates.sort_values('Email'))
else:
    print("\nNo duplicate emails found.")

# Part 3: Generate Data Quality Report
# -----------------------------------
print("\n" + "="*50)
print("PART 3: CUSTOMER DATA QUALITY REPORT")
print("="*50)

def generate_customer_report(df):
    """Generate comprehensive data quality report"""
    
    report = {
        'overview': {
            'Total Customers': len(df),
            'Total Columns': len(df.columns),
            'Duplicate Emails': df.duplicated(subset=['Email']).sum(),
            'Complete Records': df.notna().all(axis=1).sum()
        },
        'missing_values': df.isnull().sum().sort_values(ascending=False),
        'data_types': df.dtypes,
        'email_analysis': {
            'Unique Emails': df['Email'].nunique(),
            'Null Emails': df['Email'].isnull().sum()
        },
        'phone_analysis': {
            'Unique Phone Numbers': df['Phone'].nunique(),
            'Null Phone Numbers': df['Phone'].isnull().sum()
        }
    }
    return report

# Generate and display report
report = generate_customer_report(customer_df)

# Print overview
print("\nDATASET OVERVIEW:")
for k, v in report['overview'].items():
    print(f"{k:<20}: {v}")

# Print missing values
print("\nMISSING VALUES BY COLUMN:")
print(report['missing_values'][report['missing_values'] > 0])

# Print data types
print("\nDATA TYPES:")
print(report['data_types'])

# Print email analysis
print("\nEMAIL ANALYSIS:")
for k, v in report['email_analysis'].items():
    print(f"{k:<20}: {v}")

# Print phone analysis
print("\nPHONE ANALYSIS:")
for k, v in report['phone_analysis'].items():
    print(f"{k:<20}: {v}")

# Optional: Save report to file
try:
    with open('customer_data_quality_report.txt', 'w') as f:
        f.write("CUSTOMER DATA QUALITY REPORT\n")
        f.write("="*50 + "\n")
        f.write("\nOVERVIEW:\n")
        for k, v in report['overview'].items():
            f.write(f"{k:<20}: {v}\n")
        f.write("\nMISSING VALUES:\n")
        f.write(str(report['missing_values'][report['missing_values'] > 0]))
    print("\nReport saved to 'customer_data_quality_report.txt'")
except Exception as e:
    print(f"\nCould not save report: {str(e)}")





PART 1: LOADING CUSTOMER DATA & MISSING VALUES CHECK

Error: customer_data.csv not found. Using sample data instead.

Sample Customer Data Created:
   CustomerID     Name              Email       Phone    JoinDate
0         101    Alice  alice@example.com  1234567890  2023-01-01
1         102      Bob    bob@example.com        None  2023-02-15
2         103  Charlie               None  2345678901  2023-03-10
3         104    David  david@example.com  3456789012  2023-04-05
4         105      Eve    eve@example.com        None        None

PART 2: DUPLICATE EMAIL CHECK

No duplicate emails found.

PART 3: CUSTOMER DATA QUALITY REPORT

DATASET OVERVIEW:
Total Customers     : 7
Total Columns       : 5
Duplicate Emails    : 1
Complete Records    : 3

MISSING VALUES BY COLUMN:
Email       2
Phone       2
JoinDate    1
dtype: int64

DATA TYPES:
CustomerID     int64
Name          object
Email         object
Phone         object
JoinDate      object
dtype: object

EMAIL ANALYSIS:
Unique Emails