## Customer Dataset Cleaning

In [1]:
%pip install fuzzywuzzy
!python -m pip install --upgrade pip
!pip install pycountry
%pip install python-Levenshtein
%pip install pandas numpy matplotlib seaborn scikit-learn jupyter

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Activate kernal:
# .\venv_fyp\Scripts\activate

# Step 1: Import libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime, date
from fuzzywuzzy import process, fuzz
import pycountry

In [3]:
# Step 2: Load dataset
# Replace 'customer_dataset.csv' with your actual file name or path
file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Soapan Santun/2021 - 2025 Customer - Copy.csv"

original_customer_dataset_name = "2021 - 2025 Customer - Copy.csv"

# Read dataset
customer_df = pd.read_csv(file_path)

# Show first few rows
customer_df.head()

Unnamed: 0,CustomerID,Date of Birth,Gender,City,State,Country
0,CUST0001,,,Sungai Besi,Kuala Lumpur,Malaysia
1,CUST0001,,,,,
2,CUST0002,,,Mutiara Damansara,Selangor,Malaysia
3,CUST0002,,,,,
4,CUST0003,,,Shah Alam,Selangor,Malaysia


### Data Profiling

In [4]:
## Create later

### Initial checking: (Before perform data cleaning) Check optional and mandatory columns 

In [5]:
# ============================================= STAGE 1: SCHEMA & COLUMN VALIDATION =============================================
# # Optional columns
def check_optional_columns(df, threshold=0.8):
    """
    Check optional columns for fill percentage and drop columns that are mostly empty.
    Returns the modified DataFrame and a friendly message.
    """
    
    optional_columns = ["Date of Birth", "Gender"]
    
    # Normalize column names
    backup_customer_original_df = df.copy()
    df.columns = df.columns.str.strip().str.lower()
    optional_columns = [col.lower() for col in optional_columns] 
     
    dropped_columns = []

    for col in optional_columns:
        if col in df.columns:
            fill_ratio = df[col].count() / len(df) # 
            if fill_ratio < threshold:
                dropped_columns.append(col)
                df.drop(columns=[col], inplace=True)  # Drop the column immediately
                # df[col].count(): This counts the number of non-missing (non-null/non-NaN) values in the current column (col).
                # len(df): This gives the total number of rows in the DataFrame.
                # fill_ratio: The division calculates the proportion of filled (non-missing) values in that column. A ratio of 1.0 means the column is entirely filled; a ratio of 0.1 means 90% of the values are missing.


    # Generate user-friendly message
    if dropped_columns:
        dropped_str = ", ".join(dropped_columns)
        message = (
            f"We noticed that very few entries were provided for {dropped_str}. "
            "These columns have been removed. "
            "Segmentation will still be performed using geographic (City, State, Country) "
            "and behavioral data (e.g., orders, purchase items, total spend)."
        )
    else:
        message = "All optional columns have enough data and are kept for analysis."
    
    return df, message

# Mandatory columns 
def check_mandatory_columns(df, threshold=0.8):
    """
    Check mandatory columns for missing values (column-wise).
    Does not drop columns â€” only warns user if any column is too incomplete.
    Returns the DataFrame and a message summarizing issues.
    """

    mandatory_columns = ["CustomerID", "City", "State", "Country"]

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower()
    mandatory_columns = [col.lower() for col in mandatory_columns]

    missing_report = []
    warning_columns = []

    for col in mandatory_columns:
        if col in df.columns:
            fill_ratio = df[col].count() / len(df)
            missing_percent = (1 - fill_ratio) * 100

            missing_report.append(f"{col}: {missing_percent:.1f}% missing")

            # Warn if missing exceeds threshold
            if fill_ratio < (1 - threshold):
                warning_columns.append(col)
        else:
            # Handle case where column completely missing
            missing_report.append(f"{col}: column not found (100% missing)")
            warning_columns.append(col)

    # Generate friendly message
    if warning_columns:
        warning_str = ", ".join(warning_columns)
        message = (
            f"Some key fields have a high number of missing values: {warning_str}. "
            "The system will still continue cleaning and processing, "
            "but missing values will be handled automatically by our system. "
            "Please ensure your source data is as complete as possible for more accurate segmentation results.\n\n"
            "Missing Data Summary:\n" + "\n".join(missing_report)
        )
    else:
        message = (
            "All mandatory columns have sufficient data and are ready for cleaning.\n\n"
            "Missing Data Summary:\n" + "\n".join(missing_report)
        )

    return df, message


In [6]:
# --- Apply checks and save validate optional and mandatory dataset ---
# Step 1: Optional columns
optional_checked_customer_df, optional_check_message = check_optional_columns(customer_df)

# Step 2: Mandatory columns (just check, but keep all columns)
mandatory_checked_customer_df, mandatory_check_message = check_mandatory_columns(optional_checked_customer_df)

# Step 3: Save the updated dataset for cleaning
# Split the name and extension
base_name, ext = os.path.splitext(original_customer_dataset_name)

# Create new validate file name
validate_customer_file = f"{base_name}_checked{ext}"

# Save the validated dataset
mandatory_checked_customer_df.to_csv(validate_customer_file, index=False)

# --- Display results ---
print("Optional Columns Check:")
print(optional_check_message)

print("\nMandatory Columns Check:")
print(mandatory_check_message)

print(f"\nDataset saved as '{validate_customer_file}' after perform initial checking on optional and mandatory columns")

Optional Columns Check:
We noticed that very few entries were provided for date of birth, gender. These columns have been removed. Segmentation will still be performed using geographic (City, State, Country) and behavioral data (e.g., orders, purchase items, total spend).

Mandatory Columns Check:
All mandatory columns have sufficient data and are ready for cleaning.

Missing Data Summary:
customerid: 0.0% missing
city: 44.8% missing
state: 46.9% missing
country: 44.7% missing

Dataset saved as '2021 - 2025 Customer - Copy_checked.csv' after perform initial checking on optional and mandatory columns


In [7]:
# After perform initial checking on optional and mandatory columns
customer_df.head()

Unnamed: 0,customerid,city,state,country
0,CUST0001,Sungai Besi,Kuala Lumpur,Malaysia
1,CUST0001,,,
2,CUST0002,Mutiara Damansara,Selangor,Malaysia
3,CUST0002,,,
4,CUST0003,Shah Alam,Selangor,Malaysia


### Perform Data Cleaning Pipeline - CustomerDataset

In [8]:
# ============================================= STAGE 2: REMOVE DUPLICATE ENTRY ROW =============================================
def remove_duplicate_entries(df):
    """Remove duplicate rows, keeping the first occurrence"""
    df = df.drop_duplicates(keep='first')
    return df

# ============================================= STAGE 3: STANDARDIZATION & NORMALIZATION =============================================

def normalize_columns_name(df):
    """Normalize column names: lowercase, strip spaces"""
    df.columns = df.columns.str.strip().str.lower()
    return df

# ===============================================================================

def standardize_customer_id(df):
    """Standardize CustomerID format"""
    if 'customerid' in df.columns:
        df['customerid'] = df['customerid'].astype(str).str.strip().str.upper()
    return df

# ===============================================================================

def standardize_dob(df):
    """Standardize Date of Birth column and convert to YYYY-MM-DD"""
    # Rename only 'date of birth' to 'dob'
    df = df.rename(columns={'date of birth': 'dob'})  
    if 'dob' in df.columns:
        def parse_date(x):
            if pd.isnull(x):
                return np.nan
            for format in ("%d/%m/%Y", "%m-%d-%y", "%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y"):    
                try:
                    return datetime.strptime(str(x), format).date() # Final format: YYYY-MM-DD | 2025-10-15
                except Exception:
                    continue
            return np.nan  # If no valid format found
        df['dob'] = df['dob'].apply(parse_date)
    return df

# %d/%m/%Y â†’ 12/05/2000
# %m-%d-%y â†’ 05-12-00
# %Y-%m-%d â†’ 2000-05-12
# %d-%b-%Y â†’ 12-May-2000
# %d-%m-%Y â†’ 12-5-2000

# ===============================================================================

def derive_age_features(df):
    """Derive Age from DOB"""
    if 'dob' in df.columns:
        today = date.today()
        df['age'] = df['dob'].apply(
            lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day))
            if pd.notnull(x) else np.nan
        )
    return df
# Example: ((today.month, today.day) < (x.month, x.day))
# (10,15) < (12,1) â†’ True (birthday in Dec is after Oct 15)
# (10,15) < (10,16) â†’ True (birthday tomorrow)
# (10,15) < (5,20) â†’ False (birthday already passed)

# This function calculates each personâ€™s age from their date of birth (dob) by subtracting their birth year from the current year and adjusting if their birthday hasnâ€™t occurred yet this year.

# ===============================================================================

def derive_age_group(df):
    """Derive Age Group based on defined buckets"""
    if 'age' in df.columns:
        def categorize_age(age):
            if pd.isnull(age):
                return 'Unknown'
            if age < 18: return 'Below 18'
            elif 18 <= age <= 24: return '18-24'
            elif 25 <= age <= 34: return '25-34'
            elif 35 <= age <= 44: return '35-44'
            elif 45 <= age <= 54: return '45-54'
            elif 55 <= age <= 64: return '55-64'
            else: return 'Above 65'
        df['age_group'] = df['age'].apply(categorize_age)
    return df
# ===============================================================================

def drop_dob_after_age_derived(df):
    """Drop DOB column after deriving age and age_group"""
    if 'dob' in df.columns:
        df = df.drop(columns=['dob'])
    return df

# =================================================================================

def standardize_gender(df):
    """Clean and standardize gender values"""
    if 'gender' in df.columns:
        # Clean text (remove spaces, make lowercase)
        df['gender'] = df['gender'].astype(str).str.strip().str.lower()

        # Standardize using keyword detection
        def detect_gender(value):
            if any(word in value for word in ['m', 'male', 'man', 'boy']):
                return 'Male'
            elif any(word in value for word in ['f', 'female', 'woman', 'girl']):
                return 'Female'
            else:
                return 'Unknown'

        df['gender'] = df['gender'].apply(detect_gender)
    return df

# ==================================================================================

def standardize_location(df):
    """Standardize City, State, Country using fuzzy and pycountry"""
    #  City 
    if 'city' in df.columns:
        df['city'] = df['city'].astype(str).str.title().str.strip()

    # State 
    if 'state' in df.columns:
        states = [sub.name for sub in pycountry.subdivisions if sub.country_code == 'MY']
        df['state'] = df['state'].astype(str).str.title().str.strip()
        df['state'] = df['state'].apply(
            lambda x: process.extractOne(x, states, scorer=fuzz.token_sort_ratio)[0] if x else 'Unknown'
        )

    # Country ---
    if 'country' in df.columns:
        countries = [c.name for c in pycountry.countries]
        df['country'] = df['country'].astype(str).str.title().str.strip()
        df['country'] = df['country'].apply(
            lambda x: process.extractOne(x, countries, scorer=fuzz.token_sort_ratio)[0]
            if x and len(x) > 2 else 'Malaysia'
        )
    return df

In [9]:
# ============================================= STAGE 4: MISSING VALUE HANDLING =============================================

def handle_missing_values(df):
    """
    Handle missing values using a column-based approach.
    Each column is treated independently based on its type and business logic.
    """

    # ----- 1. Customer ID -----
    if 'customerid' in df.columns:
        df = df[df['customerid'].notna()]  # Drop rows without ID

    # ----- 2. Age -----
    if 'age' in df.columns:
        missing_ratio = df['age'].isna().mean()

        if missing_ratio > 0:
            print(f"Missing ratio for age: {missing_ratio:.2%}")
            
            if 'gender' in df.columns and df['gender'].nunique() > 1:
                # Group by gender if available
                df['age'] = df.groupby('gender')['age'].transform(
                    lambda x: x.fillna(x.median())
                )
                print("Applied gender-based median imputation for age.")
            else:
                # No gender column found or only one unique gender
                print("Gender column not available or not diverse. Using overall median for imputation.")
            
            # Always fill any remaining missing values with overall median
            df['age'] = df['age'].fillna(df['age'].median())
            
            # Update derived column if needed
            df = derive_age_group(df)


    # ----- 4. Gender -----
    if 'gender' in df.columns:
        df['gender'] = df['gender'].fillna(df['gender'].mode()[0])

    # ----- 5. City, State, Country -----
    for col in ['city', 'state', 'country']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])

    return df

In [10]:
# =============================================(WIP) STAGE 5: OUTLIER DETECTION =============================================

def detect_outliers(df):
    """Detect outliers in Age"""
    if 'age' in df.columns:
        df['age'] = df['age'].apply(lambda x: np.nan if pd.notnull(x) and (x < 0 or x > 110) else x)
    return df

# ============================================= STAGE 6: DEDUPLICATION =============================================
# This function removes duplicate CustomerIDs and keeps the row with the most non-missing data to preserve the most complete customer record.
def deduplicate_customers(df):
    """Keep the most complete record for each CustomerID"""
    if 'customerid' in df.columns:
        df = (
            df.loc[df.groupby('customerid').apply(lambda x: x.notna().sum(axis=1).idxmax())]
            .reset_index(drop=True)
        )
    return df

In [None]:
# ============================================= MAIN CLEANING PIPELINE =============================================

def clean_customer_dataset(df, validate_customer_file):
    """
    Main cleaning pipeline for customer dataset.
    This function executes all cleaning stages in sequence:
    1. Schema & Column Validation
    2. Duplicate Removal
    3. Standardization & Normalization
    4. Missing Value Handling
    5. Outlier Detection
    6. Deduplication
    Finally, it saves the cleaned dataset and returns it.
    """
    print("ðŸš€ Starting data cleaning pipeline...\n")
    print("========== STAGE 1: SCHEMA & COLUMN VALIDATION ==========")
    df, optional_msg = check_optional_columns(df)
    df, mandatory_msg = check_mandatory_columns(df)
    print(optional_msg)
    print(mandatory_msg)

    print("\n========== STAGE 2: REMOVE DUPLICATE ENTRY ROW ==========")
    df = remove_duplicate_entries(df)
    print("Duplicate entries removed.")

    print("\n========== STAGE 3: STANDARDIZATION & NORMALIZATION ==========")
    df = normalize_columns_name(df)
    df = standardize_customer_id(df)
    df = standardize_dob(df)
    df = derive_age_features(df)
    df = derive_age_group(df)
    df = drop_dob_after_age_derived(df)
    df = standardize_gender(df)
    df = standardize_location(df)
    print("Standardization and normalization completed.")

    print("\n========== STAGE 4: MISSING VALUE HANDLING ==========")
    df = handle_missing_values(df)
    print("Missing values handled successfully.")

    print("\n========== STAGE 5: OUTLIER DETECTION ==========")
    df = detect_outliers(df)
    print("Outliers handled (e.g., unrealistic ages set to NaN).")

    print("\n========== STAGE 6: DEDUPLICATION ==========")
    df = deduplicate_customers(df)
    print("Duplicate CustomerIDs deduplicated by data completeness.")

    # ----- Save Cleaned Dataset -----
    base_name, ext = os.path.splitext(original_dataset_name)
    cleaned_file = f"{base_name}_cleaned{ext}"
    df.to_csv(cleaned_file, index=False)

    print("\nâœ… Data cleaning pipeline completed successfully!")
    print(f"Cleaned dataset saved as: {cleaned_file}")
    print("Complete data cleaning pipeline execution for customer dataset.")
    return df, cleaned_file


In [13]:
clean_customer_dataset(customer_df, validate_customer_file)

ðŸš€ Starting data cleaning pipeline...

All optional columns have enough data and are kept for analysis.
All mandatory columns have sufficient data and are ready for cleaning.

Missing Data Summary:
customerid: 0.0% missing
city: 44.8% missing
state: 46.9% missing
country: 44.7% missing

Duplicate entries removed.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['customerid'] = df['customerid'].astype(str).str.strip().str.upper()


Standardization and normalization completed.

Missing values handled successfully.

Outliers handled (e.g., unrealistic ages set to NaN).

Duplicate CustomerIDs deduplicated by data completeness.

âœ… Data cleaning pipeline completed successfully!
Cleaned dataset saved as: 2021 - 2025 Customer - Copy_checked_cleaned.csv
Complete data cleaning pipeline execution for customer dataset.


  df.loc[df.groupby('customerid').apply(lambda x: x.notna().sum(axis=1).idxmax())]


(     customerid               city                             state   country
 0      CUST0001        Sungai Besi  Wilayah Persekutuan Kuala Lumpur  Malaysia
 1      CUST0002  Mutiara Damansara                          Selangor  Malaysia
 2      CUST0003          Shah Alam                          Selangor  Malaysia
 3      CUST0004               Muar                             Johor  Malaysia
 4      CUST0005             Others                          Selangor  Malaysia
 ...         ...                ...                               ...       ...
 3194   CUST3195      Petaling Jaya                          Selangor  Malaysia
 3195   CUST3196       Kuala Lumpur  Wilayah Persekutuan Kuala Lumpur  Malaysia
 3196   CUST3197           Selangor                          Kelantan  Malaysia
 3197   CUST3198               ****                          Kelantan  Malaysia
 3198   CUST3199               ****                          Kelantan  Malaysia
 
 [3199 rows x 4 columns],
 '2021 - 202