## Customer Dataset Cleaning

In [1]:
# !pip install fuzzywuzzy
# !python -m pip install --upgrade pip
# !pip install python-Levenshtein

In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
from fuzzywuzzy import process


In [3]:
# Step 2: Load dataset
# Point file_path at your own copy of the customer CSV.
# NOTE: this is an absolute, machine-specific path, so the notebook will not run
# unchanged on another computer.
file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Soapan Santun/2021 - 2025 Customer.csv"

# Derive the bare file name from file_path instead of repeating it as a second
# literal, so changing the path above cannot leave a stale name behind.
original_dataset_name = os.path.basename(file_path)

# Read dataset
df = pd.read_csv(file_path)

# Show first few rows
df.head()

Unnamed: 0,CustomerID,Date of Birth,Gender,Income Level,City,State,Country
0,CUST0001,,female,,Sungai Besi,Kuala Lumpur,Malaysia
1,CUST0001,,female,,,,
2,CUST0002,,female,,Mutiara Damansara,Selangor,Malaysia
3,CUST0002,,female,,,,
4,CUST0003,,female,,Shah Alam,Selangor,Malaysia


### Initial check (before data cleaning): optional and mandatory columns

In [4]:
# Optional columns
def check_optional_columns(df, threshold=0.5):
    """
    Drop optional columns that are mostly empty and describe the outcome.

    The optional columns are "date of birth", "gender" and "income level".
    Column names are normalised (whitespace stripped, lower-cased) on a copy of
    ``df``; the DataFrame passed in by the caller is not modified.

    Args:
        df: Customer DataFrame whose column labels are strings.
        threshold: Minimum share of non-missing values (0.0-1.0) an optional
            column needs in order to be kept.

    Returns:
        A tuple ``(df, message)``: the normalised DataFrame without the sparse
        optional columns, and a user-friendly message about what was removed.
    """
    optional_columns = ["date of birth", "gender", "income level"]

    # Work on a copy so the caller's DataFrame keeps its original columns.
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower()

    total_rows = len(df)
    dropped_columns = []

    # With zero rows no fill ratio can be computed, so every column is kept.
    if total_rows:
        for col in optional_columns:
            if col not in df.columns:
                continue
            # count() is the number of non-missing values, so fill_ratio is the
            # proportion of filled cells: 1.0 means fully filled, 0.1 means
            # 90% of the values are missing.
            fill_ratio = df[col].count() / total_rows
            if fill_ratio < threshold:
                dropped_columns.append(col)

    df = df.drop(columns=dropped_columns)

    # Generate user-friendly message
    if dropped_columns:
        dropped_str = ", ".join(dropped_columns)
        message = (
            f"We noticed that very few entries were provided for {dropped_str}. "
            "These columns have been removed. "
            "Segmentation will still be performed using geographic (City, State, Country) "
            "and behavioral data (e.g., orders, purchase items, total spend)."
        )
    else:
        message = "All optional columns have enough data and are kept for analysis."

    return df, message

# Mandatory columns 
def check_mandatory_columns(df, threshold=0.2):
    """
    Check mandatory columns for missing values and return a friendly message.

    The mandatory columns are "customerid", "city", "state" and "country".
    Processing is never blocked: the function only reports, and all columns
    are kept. Column names are normalised (whitespace stripped, lower-cased)
    on a copy of ``df``; the DataFrame passed in by the caller is not modified.

    Args:
        df: Customer DataFrame whose column labels are strings.
        threshold: Largest share of rows (0.0-1.0) that may have a missing
            mandatory value before the warning message is produced.

    Returns:
        A tuple ``(df, message)``: the DataFrame with normalised column names
        and a user-friendly validation message.
    """
    mandatory_columns = ["customerid", "city", "state", "country"]

    # Work on a copy so the caller's DataFrame keeps its original columns.
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower()

    missing_columns = [col for col in mandatory_columns if col not in df.columns]
    total_rows = len(df)

    if missing_columns:
        # An absent mandatory column means every row lacks that field.
        percent_invalid = 1.0
    elif total_rows == 0:
        # No rows: nothing can be invalid, and this avoids dividing by zero.
        percent_invalid = 0.0
    else:
        # A row is invalid when any of its mandatory fields is missing.
        invalid_rows = df[mandatory_columns].isnull().any(axis=1)
        percent_invalid = invalid_rows.sum() / total_rows

    if missing_columns or percent_invalid > threshold:
        message = (
            f"Your dataset has too many missing values in required fields "
            f"({percent_invalid*100:.0f}% of rows). "
            "The system can still clean and process the dataset with the information you have, "
            "but please note that the results may be slightly biased and not fully represent all your customers. "
            "You can continue with the current file, but we advise to re-check your data source. Are you sure you want to continue?"
        )
    else:
        message = "Dataset passed initial validation for mandatory columns. Ready for cleaning!"

    return df, message

In [5]:
# --- Run both column checks, then persist the result ---
# Optional columns first: sparsely filled ones are removed from the frame.
df, optional_check_message = check_optional_columns(df)

# Mandatory columns next: this step only reports, every column is kept.
df, mandatory_check_message = check_mandatory_columns(df)

# Build the output file name by inserting "_cleaned" before the extension
# of the original dataset name.
base_name, ext = os.path.splitext(original_dataset_name)
cleaned_file = f"{base_name}_cleaned{ext}"

# Write the checked dataset next to the notebook (row index omitted).
df.to_csv(cleaned_file, index=False)

# --- Display results ---
print("Optional Columns Check:", optional_check_message, sep="\n")
print("\nMandatory Columns Check:", mandatory_check_message, sep="\n")
print(f"\nCleaned dataset saved as '{cleaned_file}'")


Optional Columns Check:
We noticed that very few entries were provided for date of birth, income level. These columns have been removed. Segmentation will still be performed using geographic (City, State, Country) and behavioral data (e.g., orders, purchase items, total spend).

Mandatory Columns Check:
Your dataset has too many missing values in required fields (47% of rows). The system can still clean and process the dataset with the information you have, but please note that the results may be slightly biased and not fully represent all your customers. You can continue with the current file, but we advise to re-check your data source. Are you sure you want to continue?

Cleaned dataset saved as '2021 - 2025 Customer_cleaned.csv'


In [6]:
# Preview the DataFrame after the initial optional/mandatory column checks
df.head()

Unnamed: 0,customerid,gender,city,state,country
0,CUST0001,female,Sungai Besi,Kuala Lumpur,Malaysia
1,CUST0001,female,,,
2,CUST0002,female,Mutiara Damansara,Selangor,Malaysia
3,CUST0002,female,,,
4,CUST0003,female,Shah Alam,Selangor,Malaysia


### Perform Data Cleaning

In [7]:
def _clean_text(series):
    """Return ``series`` as whitespace-stripped text with missing values kept missing.

    ``astype(str)`` on its own turns NaN/None into the literal strings
    "nan"/"None"; the mask restores NaN for entries that were missing
    originally or that are blank after stripping.
    """
    text = series.astype(str).str.strip()
    return text.where(series.notna() & (text != ""))


def clean_customer_dataset(df, order_df=None, missing_threshold=0.5):
    """
    Automatically clean the customer dataset for segmentation.

    Handles both mandatory and optional columns flexibly: every step is
    skipped when the column it targets is absent. The input DataFrame is not
    modified; a cleaned copy is returned.

    Args:
        df: Customer DataFrame whose column labels are strings.
        order_df: Optional orders DataFrame. When it has a "customerid"
            column, only customers whose ID appears in it are kept.
        missing_threshold: "date of birth" and "income level" are dropped when
            their share of missing values exceeds this fraction.

    Returns:
        The cleaned DataFrame (nothing is written to disk here).
    """
    # Step 1: Normalize column names on a copy so the caller's frame is untouched
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower()
    has_customer_id = "customerid" in df.columns

    # Step 2: Clean CustomerID *before* de-duplicating, so IDs that differ only
    # in case or surrounding whitespace are recognised as the same customer.
    if has_customer_id:
        df["customerid"] = _clean_text(df["customerid"]).str.upper()
        keep = ~df.duplicated(subset=["customerid"])
    else:
        keep = ~df.duplicated()

    # Step 3: Cross-check with orders (if provided)
    if has_customer_id and order_df is not None and "customerid" in order_df.columns:
        valid_ids = set(_clean_text(order_df["customerid"]).str.upper().dropna())
        keep = keep & df["customerid"].isin(valid_ids)

    # Apply both filters at once; .copy() makes the result an independent
    # frame so the column assignments below do not hit SettingWithCopyWarning.
    df = df.loc[keep].copy()

    # Step 4: Handle optional columns
    # --- Date of Birth ---
    if "date of birth" in df.columns:
        # Unparseable dates become NaT instead of raising.
        df["date of birth"] = pd.to_datetime(df["date of birth"], errors="coerce")
        if df["date of birth"].isna().mean() > missing_threshold:
            df = df.drop(columns=["date of birth"])

    # --- Gender ---
    if "gender" in df.columns:
        gender_map = {
            "m": "Male", "male": "Male",
            "f": "Female", "female": "Female"
        }
        # Anything unmapped, including missing values, is labelled "Other".
        df["gender"] = (
            _clean_text(df["gender"]).str.lower().map(gender_map).fillna("Other")
        )

    # --- Income Level ---
    if "income level" in df.columns:
        income_map = {
            "low": "Low", "lower": "Low",
            "medium": "Medium", "mid": "Medium", "med": "Medium", "middle": "Medium",
            "high": "High", "upper": "High"
        }
        # replace() leaves unrecognised labels as they are (lower-cased);
        # missing values stay NaN so the sparsity check below can see them.
        df["income level"] = (
            _clean_text(df["income level"]).str.lower().replace(income_map)
        )
        if df["income level"].isna().mean() > missing_threshold:
            df = df.drop(columns=["income level"])

    # Step 5: Clean City, State, Country (missing values stay missing)
    for col in ("city", "state", "country"):
        if col in df.columns:
            df[col] = _clean_text(df[col]).str.title()

    # Step 6: Hand the cleaned frame back; saving is left to the caller
    return df
