## Dataset Cleaning

In [1]:
# %pip install fuzzywuzzy
# !python -m pip install --upgrade pip
# !pip install pycountry
# %pip install python-Levenshtein
# %pip install pandas numpy matplotlib seaborn scikit-learn jupyter

# errors='coerce' in pd.to_numeric() means:
# "Try to convert everything in this column into a number.
# If it fails (e.g. the value is text or invalid), don't crash - just replace it with NaN."

In [2]:
# Activate kernel:
# .\venv_fyp\Scripts\activate

# Step 1: Import libraries
import pandas as pd
import numpy as np
import os
import pycountry
import re
import requests
import time
from datetime import datetime, date
from fuzzywuzzy import process, fuzz

### Data Profiling

In [3]:
## Create later

### Generic Function

In [4]:
# ============================================ GENERIC FUNCTIONS ============================================ #

def normalize_columns_name(df):
    """
    Trim surrounding whitespace from every column label and lowercase it.

    The labels are replaced on the passed-in DataFrame itself (no copy is made),
    and that same DataFrame is returned.
    """
    print("[LOG] Running normalize_columns_name...")
    normalized_labels = df.columns.str.strip().str.lower()
    df.columns = normalized_labels
    print(f"[LOG] Columns after normalization: {list(df.columns)}")
    return df

def check_mandatory_columns(df, dataset_type, mandatory_columns, threshold=0.8):
    """
    Report how complete each mandatory column is (shared by Customer and Order datasets).

    Parameters
    - df: DataFrame to inspect (returned unchanged)
    - dataset_type: label used in the messages, e.g. 'customer' or 'order'
    - mandatory_columns: list of column names that are required for that dataset
    - threshold: minimum acceptable share of non-null values (default 0.8)

    Returns
    - (df, message): the same DataFrame and a user-facing text that ends with a
      per-column missing-data summary. Columns that are absent, or whose fill
      ratio is below `threshold`, are named in a warning.
    """
    print(f"[LOG] Running check_mandatory_columns for {dataset_type} dataset...")

    summary_lines = []
    flagged = []

    # Step 1: measure completeness column by column
    for col in mandatory_columns:
        if col not in df.columns:
            # An absent column is reported as fully missing and always flagged
            print(f"[LOG] Mandatory column '{col}' not found")
            summary_lines.append(f"{col}: column not found (100% missing)")
            flagged.append(col)
            continue

        fill_ratio = df[col].notna().mean()
        print(f"[LOG] Mandatory column '{col}' fill ratio: {fill_ratio:.2f}")
        missing_percent = (1 - fill_ratio) * 100
        summary_lines.append(f"{col}: {missing_percent:.1f}% missing")
        if fill_ratio < threshold:
            flagged.append(col)

    # Step 2: build the message; both variants share the same summary tail
    summary_text = "Missing Data Summary:\n" + "\n".join(summary_lines)
    if not flagged:
        message = (
            f"All mandatory columns in the {dataset_type} dataset have sufficient data and are ready for cleaning.\n\n"
            + summary_text
        )
        return df, message

    warning_str = ", ".join(flagged)
    message = (
        f"Some key fields in the {dataset_type} dataset have a high number of missing values: {warning_str}. "
        "The system will continue cleaning and handle missing values automatically, "
        "but we STRONGLY encourage you to reupload your source data, ensure it was as complete as possible for accurate segmentation result later.\n\n"
        + summary_text
    )
    return df, message

def remove_duplicate_entries(df):
    """
    Drop rows that are exact duplicates of an earlier row.

    The first occurrence of each row is kept and the result gets a fresh 0..n-1 index.
    """
    print("[LOG] Running remove_duplicate_entries...")
    rows_before = len(df)
    unique_rows = df.drop_duplicates(keep='first', ignore_index=True)
    print(f"[LOG] Removed {rows_before - len(unique_rows)} duplicate rows.")
    return unique_rows

def standardize_customer_id(df):
    """
    Standardize the 'customerid' column: text, trimmed, upper-cased.

    Missing IDs become the empty string ''. If the column is absent the
    DataFrame is returned untouched.

    Known limitation: malformed IDs such as "****", "1234...." or random text
    are not detected here. They are left as-is on the assumption that such an
    ID will simply fail to match any order when the datasets are merged.
    """
    print("[LOG] Running standardize_customer_id...")
    if 'customerid' in df.columns:
        raw_ids = df['customerid']
        cleaned_ids = raw_ids.astype(str).str.strip().str.upper()
        # Replace the whole column (instead of an in-place `.loc[:, col] = ...` set) so a
        # numeric ID column can become text without an incompatible-dtype setitem.
        # Missing IDs are blanked after the string conversion so they do not end up as "NAN".
        df['customerid'] = cleaned_ids.mask(raw_ids.isna(), '')
        print("[LOG] CustomerID column standardized")
    else:
        print("[LOG] CustomerID column not found, skipping")
    return df


## Perform Data Cleaning Pipeline - CustomerDataset

In [5]:
# Step 2: Load the customer dataset
# Point `customer_file_path` at the CSV to clean; alternative test files are kept below for quick switching.
customer_file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Soapan Santun - Use/2021 - 2025 Customer - Copy.csv"
# customer_file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Test Data/customer_dataset_balanced_5033_rows.csv"
# customer_file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Test Data/customer_dataset_light_7719_rows.csv"
# customer_file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Test Data/customer_dataset_stress_6047_rows.csv"

# Derive the dataset name from the selected path so the output file names
# (<name>_standardized / <name>_cleaned) always follow the file that was actually loaded.
original_customer_dataset_name = os.path.basename(customer_file_path)

# Read dataset
customer_df = pd.read_csv(customer_file_path)

# Show first few rows (original raw data)
customer_df.head()

Unnamed: 0,CustomerID,Date of Birth,Gender,City,State
0,CUST0001,,,Sungai Besi,Kuala Lumpur
1,CUST0001,,,,
2,CUST0002,,,Mutiara Damansara,Selangor
3,CUST0002,,,,
4,CUST0003,,,Shah Alam,Selangor


In [6]:
# ============================================= (CUSTOMER DATASET) STAGE 0: NORMALIZE COLUMN NAMES =============================================
# From Generic function: normalize_columns_name

# ============================================= (CUSTOMER DATASET) STAGE 1: SCHEMA & COLUMN VALIDATION =============================================
# Optional columns & Mandatory columns(FROM GENERIC FUNCTION - check_mandatory_columns)
def customer_check_optional_columns(df, threshold=0.8):
    """
    Check the optional customer columns ('date of birth', 'gender') and drop the sparse ones.

    Parameters
    - df: customer DataFrame with normalized (lowercase) column names
    - threshold: minimum share of non-null values a column needs to be kept (default 0.8)

    Returns
    - (df, message): a DataFrame without the dropped columns and a user-facing
      text that ends with a per-column missing-data summary.

    The caller's DataFrame is not modified; columns are dropped on a new frame.
    Optional columns that are absent are reported together with the dropped ones.
    """
    print("[LOG] Running customer_check_optional_columns...")
    optional_columns = ["date of birth", "gender"]
    dropped_columns = []
    missing_report = []

    for col in optional_columns:
        if col not in df.columns:
            print(f"[LOG] Optional column '{col}' not found")
            missing_report.append(f"{col}: column not found (100% missing)")
            dropped_columns.append(col)
            continue

        # fill_ratio is the proportion of non-null values: 1.0 = fully filled, 0.1 = 90% missing
        fill_ratio = df[col].notna().mean()
        missing_percent = (1 - fill_ratio) * 100
        missing_report.append(f"{col}: {missing_percent:.1f}% missing")
        print(f"[LOG] Optional column '{col}' fill ratio: {fill_ratio:.2f}")
        if fill_ratio < threshold:
            dropped_columns.append(col)
            # Rebind instead of inplace=True so the frame passed in by the caller stays intact
            df = df.drop(columns=[col])
            print(f"[LOG] Dropped optional column '{col}' due to too many missing values")

    # Generate user-friendly message
    if dropped_columns:
        dropped_str = ", ".join(dropped_columns)
        message = (
            f"We noticed that very few entries were provided for {dropped_str}. "
            "These columns have been removed. "
            "Segmentation will still be performed using geographic (City, State) "
            "and behavioral data (e.g., orders, purchase items, total spend).\n\n"
            "Missing Data Summary:\n" + "\n".join(missing_report)
        )
    else:
        message = (
            "All optional columns have enough data and are kept for analysis.\n\n"
            "Missing Data Summary:\n" + "\n".join(missing_report)
        )

    return df, message

# ============================================= (CUSTOMER DATASET) STAGE 2: REMOVE DUPLICATE ENTRY ROW =================================================
# From Generic function: remove_duplicate_entries

# ============================================= (CUSTOMER DATASET) STAGE 3: DEDUPLICATE =================================================
def deduplicate_customers(df):
    """
    Collapse rows that share a 'customerid' into a single row per customer.

    For every other column the group's value is chosen as follows:
    - no non-null value   -> pd.NA
    - one distinct value  -> that value
    - conflicting values  -> the most frequent one (first mode)

    If 'customerid' is absent the DataFrame is returned unchanged.
    """
    print("[LOG] Running deduplicate_customers...")
    if 'customerid' not in df.columns:
        print("[LOG] 'customerid' column missing, skipping deduplication")
        return df

    def pick_value(group_values):
        distinct = group_values.dropna().unique()
        if len(distinct) > 1:
            # Conflict inside the group: fall back to the most common value
            return group_values.mode().iloc[0]
        return distinct[0] if len(distinct) == 1 else pd.NA

    # One groupby/agg pass over all columns instead of an explicit per-group loop
    grouped = df.groupby('customerid', as_index=False)
    df = grouped.agg(pick_value)
    print("[LOG] Deduplication complete (vectorized)")
    return df

# ============================================= (CUSTOMER DATASET) STAGE 4: STANDARDIZATION & NORMALIZATION =============================================
# From Generic function: standardize_customer_id

def standardize_dob(df):
    """
    Rename 'date of birth' to 'dob' and parse it into a datetime column.

    Accepted input formats, tried in this order:
        %d/%m/%Y -> 12/05/2000
        %m-%d-%y -> 05-12-00
        %Y-%m-%d -> 2000-05-12
        %d-%b-%Y -> 12-May-2000
        %d-%m-%Y -> 12-05-2000

    Missing values and values matching none of the formats become NaT.
    Surrounding whitespace is ignored. If neither 'date of birth' nor 'dob'
    exists, the DataFrame is returned without a 'dob' column.
    """
    print("[LOG] Running standardize_dob...")
    # Rename only 'date of birth' to 'dob'
    df = df.rename(columns={'date of birth': 'dob'})
    if 'dob' not in df.columns:
        print("[LOG] DOB column not found, skipping")
        return df

    print("[LOG] DOB column found, parsing dates...")
    two_digit_year_format = "%m-%d-%y"
    date_formats = ("%d/%m/%Y", two_digit_year_format, "%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y")
    today = date.today()

    def parse_date(x):
        if pd.isnull(x):
            return pd.NaT
        text = str(x).strip()
        for fmt in date_formats:
            try:
                parsed = datetime.strptime(text, fmt).date()
            except ValueError:
                # Text does not match this format; try the next one
                continue
            if fmt == two_digit_year_format and parsed > today:
                # %y maps 00-68 to 2000-2068, which puts e.g. "65" in the future.
                # A birth date cannot be in the future, so read it as the previous century.
                parsed = parsed.replace(year=parsed.year - 100)
            return parsed
        return pd.NaT  # No format matched

    df['dob'] = df['dob'].apply(parse_date)
    df['dob'] = pd.to_datetime(df['dob'])
    print("[LOG] DOB parsing complete. Invalid dates marked as NaT")
    return df

# ===============================================================================

def derive_age_features(df):
    """
    Add an 'age' column holding each customer's age in completed years, derived from 'dob'.

    Age = current year - birth year, minus one when this year's birthday has not
    happened yet. The "not yet" test compares (month, day) tuples, e.g. with
    today = 15 Oct:
        (10, 15) < (12, 1)  -> True  (December birthday still ahead)
        (10, 15) < (10, 16) -> True  (birthday is tomorrow)
        (10, 15) < (5, 20)  -> False (birthday already passed)

    Rows with a missing 'dob' get a missing age. If 'dob' is absent the
    DataFrame is returned unchanged.
    """
    print("[LOG] Running derive_age_features...")
    if 'dob' not in df.columns:
        print("[LOG] DOB column not found, skipping")
        return df

    today = date.today()

    def completed_years(birth):
        if not pd.notnull(birth):
            return None
        birthday_still_ahead = (today.month, today.day) < (birth.month, birth.day)
        return today.year - birth.year - birthday_still_ahead

    df['age'] = df['dob'].apply(completed_years)
    # Coerce to a numeric column; missing ages end up as NaN
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    print("[LOG] Age derived from DOB")
    return df

# ===============================================================================

def derive_age_group(df):
    """
    Add an 'age_group' column that buckets 'age' into labelled ranges.

    Buckets: 'Below 18', '18-24', '25-34', '35-44', '45-54', '55-64',
    'Above 65' (65 and older), and 'Unknown' for a missing age.

    Each bucket is a half-open range [lower, next lower bound), so fractional
    ages (e.g. 24.5 from median imputation) land in the bucket of their whole-year
    part instead of falling through to the last bucket.
    If 'age' is absent the DataFrame is returned unchanged.
    """
    print("[LOG] Running derive_age_group...")
    if 'age' in df.columns:
        # (exclusive upper bound, label), in ascending order
        buckets = (
            (18, 'Below 18'),
            (25, '18-24'),
            (35, '25-34'),
            (45, '35-44'),
            (55, '45-54'),
            (65, '55-64'),
        )

        def categorize_age(age):
            if pd.isnull(age):
                return 'Unknown'
            for upper_bound, label in buckets:
                if age < upper_bound:
                    return label
            return 'Above 65'

        df['age_group'] = df['age'].apply(categorize_age)
        print("[LOG] Age groups derived")
    else:
        print("[LOG] Age column not found, skipping")
    return df

# ===============================================================================

def drop_dob_after_age_derived(df):
    """
    Remove the 'dob' column once age and age_group have been derived from it.

    Returns a new DataFrame without 'dob', or the input unchanged when the column is absent.
    """
    print("[LOG] Running drop_dob_after_age_derived...")
    if 'dob' not in df.columns:
        print("[LOG] DOB column not found, skipping")
        return df

    without_dob = df.drop(columns=['dob'])
    print("[LOG] Dropped DOB column")
    return without_dob

# =================================================================================

def standardize_gender(df):
    """
    Map the 'gender' column onto exactly three labels: 'Male', 'Female', 'Unknown'.

    Matching is case-insensitive and ignores surrounding whitespace. Recognised
    spellings are m/male/man/boy and f/female/woman/girl; every other value
    (including missing ones) becomes 'Unknown'.
    If 'gender' is absent the DataFrame is returned unchanged.
    """
    print("[LOG] Running standardize_gender...")
    if 'gender' not in df.columns:
        print("[LOG] Gender column not found, skipping")
        return df

    known_spellings = {}
    for label, spellings in (
        ('Male', ('m', 'male', 'man', 'boy')),
        ('Female', ('f', 'female', 'woman', 'girl')),
    ):
        for spelling in spellings:
            known_spellings[spelling] = label

    normalized = df['gender'].astype(str).str.strip().str.lower()
    # Unrecognised spellings map to NaN, which is then labelled 'Unknown'
    df['gender'] = normalized.map(known_spellings).fillna('Unknown')
    print("[LOG] Gender standardized (vectorized)")
    return df

# ==================================================================================

def standardize_location(df):
    """
    Standardize the 'city' and 'state' columns (each is handled only if present).

    City:
    - missing values become '', text is title-cased and trimmed
    - known short forms are expanded (Kl -> Kuala Lumpur, Pj -> Petaling Jaya)
    - empty, implausible ('others', odd symbols, too short/long, a character
      repeated 4+ times) values are set to 'Unknown'

    State:
    - text is title-cased and trimmed
    - federal-territory short names are expanded via an alias map
    - everything else is fuzzy-matched against pycountry's Malaysian subdivision
      names; matches scoring below 80 become 'Unknown'

    Returns the same DataFrame with the columns overwritten.
    """
    print("[LOG] Running standardize_location...")

    # Helper function: detect suspicious city names
    def is_suspicious_city(name):
        if not name or name.strip() == '':
            return True
        name = str(name).strip()

        if len(name) < 2 or len(name) > 50:  # Too short or too long
            return True

        if re.search(r'[^A-Za-z\s\'-]', name):  # Only letters, whitespace, apostrophe and dash are allowed
            return True

        if re.search(r'(.)\1{3,}', name):  # Same character four or more times in a row
            return True
        return False

    # --- City ---
    if 'city' in df.columns:
        df['city'] = df['city'].fillna('').astype(str).str.title().str.strip()
        # Common city aliases (short forms, local spellings, etc.).
        # Keys must be in title case because the column has just been title-cased
        # ("PJ" / "pj" arrive here as "Pj").
        city_alias_map = {
            "Kl": "Kuala Lumpur",
            "Pj": "Petaling Jaya",
        }
        # Apply alias replacements first
        df['city'] = df['city'].replace(city_alias_map)

        suspicious_mask = df['city'].apply(lambda x: is_suspicious_city(x) or x.lower() in ['others', 'other'])
        suspicious_count = suspicious_mask.sum()
        df.loc[suspicious_mask, 'city'] = 'Unknown'

        print(f"[LOG] Standardized 'city'. Suspicious/unknown entries set to 'Unknown': {suspicious_count}")
    else:
        print("[LOG] 'city' column not found, skipping city standardization")

    # --- State ---
    if 'state' in df.columns:
        # Official subdivision names for Malaysia as listed by pycountry
        malaysia_states = [sub.name for sub in pycountry.subdivisions if sub.country_code == 'MY']
        alias_map = {
            "Kuala Lumpur": "Wilayah Persekutuan Kuala Lumpur",
            "Kl": "Wilayah Persekutuan Kuala Lumpur",
            "Labuan": "Wilayah Persekutuan Labuan",
            "Putrajaya": "Wilayah Persekutuan Putrajaya"
        }

        df['state'] = df['state'].fillna('').astype(str).str.title().str.strip()
        # Resolve each distinct state once, then map the whole column through the result
        unique_states = df['state'].unique()
        state_map = {}
        for s in unique_states:
            s_clean = s.strip().title()

            if not s_clean or s_clean == 'Unknown':
                state_map[s] = 'Unknown'

            elif s_clean in alias_map:   # Check alias first
                state_map[s] = alias_map[s_clean]

            else:
                match, score = process.extractOne(s_clean, malaysia_states, scorer=fuzz.token_sort_ratio)
                state_map[s] = match if score >= 80 else 'Unknown'

        # Apply mapping to the dataframe
        df['state'] = df['state'].map(state_map)
        print("[LOG] State standardized (cached fuzzy matching)")
    else:
        print("[LOG] 'state' column not found, skipping state standardization")

    return df

# ============================================= (CUSTOMER DATASET) STAGE 5: MISSING VALUE HANDLING =============================================
def handle_missing_values_customer(df):
    """
    Fill or drop missing values in the standardized customer dataset.

    Steps (each is skipped when its columns are absent):
    1. customerid - rows whose ID is null or blank are dropped.
    2. age        - missing ages get the median age of their gender group (when
                    gender has more than one distinct value), then any remaining
                    gaps get the overall median; age_group is re-derived afterwards.
    3. gender     - 'Unknown' is replaced with the most frequent of Male/Female.
    4. city/state - values equal to 'Unknown' are treated as missing:
         Case 1: city known, state unknown -> state looked up through the
                 geocode.maps.co search API; if that yields no valid Malaysian
                 state, both city and state are replaced by the dataset's mode
                 state and that state's mode city.
         Case 2: state known, city unknown -> mode city of that state.
         Case 3: both unknown -> most frequent (city, state) pair.

    The geocoding API key is read from the GEOCODE_API_KEY environment variable.
    When it is not set, no API call is made and Case 1 goes straight to the fallback.

    Returns the resulting DataFrame.
    """
    print("[LOG] Running handle_missing_values...")

    # Secrets do not belong in the notebook source: take the key from the environment.
    API_KEY = os.environ.get("GEOCODE_API_KEY")
    GEOCODE_URL = "https://geocode.maps.co/search"
    SLEEP_TIME = 1.2  # seconds to wait after each API response

    # --- Drop rows without ID ---
    if 'customerid' in df.columns:
        before_drop = len(df)
        # standardize_customer_id turns missing IDs into '', so blank strings count as missing too
        has_id = df['customerid'].notna() & (df['customerid'].astype(str).str.strip() != '')
        df = df[has_id].copy()
        print(f"[LOG] Dropped {before_drop - len(df)} rows without CustomerID")
    else:
        print("[LOG] 'customerid' column missing, skipping drop")

    # ----- Age -----
    if 'age' in df.columns:
        missing_ratio = df['age'].isna().mean()
        print(f"[LOG] Age missing ratio: {missing_ratio:.2%}")
        if missing_ratio > 0:
            if 'gender' in df.columns and df['gender'].nunique() > 1:
                # Group by gender if available
                df['age'] = df.groupby('gender')['age'].transform(
                    lambda x: x.fillna(x.median())
                )
                print("[LOG] Applied gender-based median imputation for missing age")

            # Always fill any remaining missing values with the overall median,
            # whether or not the gender-based step ran
            df['age'] = df['age'].fillna(df['age'].median())

            # Keep the derived column in sync with the imputed ages
            df = derive_age_group(df)
        else:
            print("[LOG] No missing values for age")
    else:
        print("[LOG] 'age' column missing, skipping age imputation")

    # --- Gender mode imputation ---
    if 'gender' in df.columns:
        mode_series = df.loc[df['gender'].isin(['Male', 'Female']), 'gender'].mode()

        if not mode_series.empty:
            mode = mode_series[0]
            unknown_mask = df['gender'] == 'Unknown'
            count = unknown_mask.sum()
            if count > 0:
                df.loc[unknown_mask, 'gender'] = mode
                print(f"[LOG] Replaced {count} 'Unknown' gender values with mode: {mode}")
        else:
            print("[WARN] No valid gender mode found (only 'Unknown' present) ‚Äî skipping imputation.")
    else:
        print("[LOG] 'gender' column missing, skipping gender imputation")

    # --- City & State handling ---
    if {'city', 'state'}.issubset(df.columns):
        print("\nüîç Handling missing city/state values...")
        malaysia_states = [sub.name for sub in pycountry.subdivisions if sub.country_code == 'MY']

        # Case 1: missing state but city known -> fill via geocoding API
        print("\n[LOG] Case 1: Filling missing state where city is known...")
        # Get rows needing state fill (city known, state unknown)
        mask_case1 = (df['city'] != 'Unknown') & (df['state'] == 'Unknown')
        cities_to_query = df.loc[mask_case1, 'city'].unique().tolist()

        print(f"[LOG] {len(cities_to_query)} unique cities need state lookup")
        if cities_to_query and not API_KEY:
            print("[WARN] GEOCODE_API_KEY is not set - skipping geocoding lookup, using mode-based fallback")

        # Each city appears once in cities_to_query, so every city is looked up at most once
        for city in cities_to_query:
            fill_state = None
            if API_KEY:
                try:
                    resp = requests.get(GEOCODE_URL, params={"q": f"{city}, Malaysia", "api_key": API_KEY}, timeout=10)
                    if resp.status_code == 200:
                        data = resp.json()
                        if isinstance(data, list) and data:
                            state_name = data[0].get("address", {}).get("state")
                            # Accept the answer only if it is an official Malaysian subdivision name
                            if state_name and state_name in malaysia_states:
                                fill_state = state_name
                    time.sleep(SLEEP_TIME)
                except Exception as e:
                    # Best effort: any lookup failure falls through to the mode-based fallback
                    print(f"[WARN] Failed to get state for city '{city}': {e}")

            # Fill values
            if fill_state:
                df.loc[(df['city'] == city) & (df['state'] == 'Unknown'), 'state'] = fill_state
                print(f"[TRACE] Filled {city} ‚Üí state='{fill_state}' (API valid)")
            else:
                # Fallback: use mode state & mode city for that state.
                # Reached when the lookup was skipped or failed, the response has no
                # "state" field, or the returned state is not in the subdivision list.
                valid_states = df[df['state'] != 'Unknown']['state']
                mode_state = valid_states.mode()[0] if not valid_states.empty else 'Unknown'

                # Compute mode city per state
                mode_city_per_state = (
                    df[df['city'] != 'Unknown'].groupby('state')['city']
                    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown')
                    .to_dict()
                )
                mode_city = mode_city_per_state.get(mode_state, 'Unknown')

                mask_fill = (df['city'] == city) & (df['state'] == 'Unknown')
                df.loc[mask_fill, 'state'] = mode_state
                df.loc[mask_fill, 'city'] = mode_city
                print(f"[TRACE] Filled {mask_fill.sum()} row(s) ‚Üí city='{mode_city}', state='{mode_state}' (Fallback)")

        # Case 2: missing city but state known -> fill with mode city per state
        print("\n[LOG] Case 2: Filling missing city where state is known...")
        mask_case2 = (df['city'] == 'Unknown') & (df['state'] != 'Unknown')
        if mask_case2.any():
            mode_city_per_state = (
                df[df['city'] != 'Unknown'].groupby('state')['city']
                .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown')
                .to_dict()
            )
            for state, city_mode in mode_city_per_state.items():
                mask_fill = mask_case2 & (df['state'] == state)
                df.loc[mask_fill, 'city'] = city_mode
                print(f"[TRACE] Filled {mask_fill.sum()} row(s) ‚Üí missing city for state='{state}' ‚Üí city='{city_mode}'")

        # Case 3: both missing -> fill with most frequent pair
        print("\n[LOG] Case 3: Filling missing city and state...")
        mask_case3 = (df['city'] == 'Unknown') & (df['state'] == 'Unknown')
        if mask_case3.any():
            valid_pairs = df[(df['city'] != 'Unknown') & (df['state'] != 'Unknown')]
            if not valid_pairs.empty:
                city_mode, state_mode = valid_pairs.groupby(['city', 'state']).size().idxmax()
                df.loc[mask_case3, ['city', 'state']] = [city_mode, state_mode]
                print(f"[TRACE] Filled {mask_case3.sum()} row(s) ‚Üí missing city/state ‚Üí City='{city_mode}', State='{state_mode}'")
            else:
                print("[WARN] No valid city/state pairs to fill missing both values")

    return df

# ============================================= (CUSTOMER DATASET) STAGE 6: OUTLIER DETECTION =============================================
def customer_detect_outliers(df):
    """
    Handle outliers in 'age', choosing the method by dataset size.

    - fewer than 500 rows: IQR rule; ages outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are set to NaN
    - 500 rows or more:    ages are capped to the 1st-99th percentile range

    'age' is first coerced to numeric (non-numeric values become NaN).
    If 'age' is absent the DataFrame is returned unchanged.

    NOTE(review): an existing 'age_group' column is not refreshed here, so it may no
    longer match the adjusted ages - confirm whether the caller should re-derive it.
    """
    print("[LOG] Running detect_outliers...")
    if 'age' not in df.columns:
        print("[LOG] 'age' column missing, skipping outlier detection")
        return df

    df['age'] = pd.to_numeric(df['age'], errors='coerce')

    n = len(df)
    print(f"[LOG] Dataset has {n} rows")
    if n < 500:
        # IQR method
        Q1 = df['age'].quantile(0.25)
        Q3 = df['age'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Count exactly the rows that get blanked (outside the IQR fences, not merely outside Q1..Q3)
        is_outlier = (df['age'] < lower_bound) | (df['age'] > upper_bound)
        outliers = int(is_outlier.sum())
        # mask() yields a float column with NaN, which also works when 'age' is integer-typed
        df['age'] = df['age'].mask(is_outlier)
        print(f"[LOG] IQR Applied for {n} rows. Range: [{lower_bound:.1f}, {upper_bound:.1f}] Outliers set to NaN: {outliers}")
    else:
        # Percentile capping
        lower_bound = df['age'].quantile(0.01)
        upper_bound = df['age'].quantile(0.99)
        df['age'] = df['age'].clip(lower=lower_bound, upper=upper_bound)
        print(f"[LOG] Percentile capping  applied for {n} rows. Capped to [{lower_bound:.1f}, {upper_bound:.1f}]")
    return df


In [7]:
# ============================================= (CUSTOMER DATASET) DATASET CLEANING PIPELINE =============================================
def clean_customer_dataset(df, original_customer_dataset_name):
    """
    Main cleaning pipeline for the customer dataset.

    Stages, in the order they are executed:
    0. Column name normalization
    1. Schema & column validation (optional and mandatory columns)
    2. Duplicate row removal
    3. Deduplication by CustomerID
    4. Standardization & normalization (then the standardized dataset is saved)
    5. Missing value handling
    6. Outlier detection
    Finally the cleaned dataset is saved.

    Parameters
    - df: raw customer DataFrame; it is copied first, so the caller's frame is
      left untouched and the pipeline can be re-run on the same input
    - original_customer_dataset_name: file name (with extension) used to build the
      output names "<name>_standardized<ext>" and "<name>_cleaned<ext>", both
      written relative to the current working directory

    Returns
    - (df, cleaned_file): the cleaned DataFrame and the name of the cleaned CSV file

    NOTE(review): deduplication (stage 3) runs before CustomerID standardization
    (stage 4), so IDs that differ only by case or surrounding whitespace are not
    merged - confirm whether that ordering is intended.
    """
    print("üöÄ Starting customer data cleaning pipeline...\n")

    # Several stages modify the frame they receive; work on a private copy so the
    # DataFrame passed in by the caller keeps its original columns and values.
    df = df.copy()

    # =======================================================
    # STAGE 0: NORMALIZE COLUMN NAMES (FROM GENERIC FUNCTION)
    # =======================================================
    print("========== [STAGE 0 START] Normalize Column Names ==========")
    df = normalize_columns_name(df)
    print("‚úÖ [STAGE 0 COMPLETE] Column names normalized.\n")

    # =============================================
    # STAGE 1: SCHEMA & COLUMN VALIDATION
    # =============================================
    print("========== [STAGE 1 START] Schema & Column Validation ==========")
    df, optional_msg = customer_check_optional_columns(df)

    # (FROM GENERIC FUNCTION)
    customer_mandatory = ["customerid", "city", "state"]
    df, mandatory_msg = check_mandatory_columns(df,dataset_type="customer", mandatory_columns=customer_mandatory)

    print(optional_msg)
    print(mandatory_msg)
    print("‚úÖ [STAGE 1 COMPLETE] Schema validation done.\n")

    # ============================================================
    # STAGE 2: REMOVE DUPLICATE ENTRY ROWS (FROM GENERIC FUNCTION)
    # ============================================================
    print("========== [STAGE 2 START] Remove Duplicate Entry Rows ==========")
    df = remove_duplicate_entries(df)
    print("‚úÖ [STAGE 2 COMPLETE] Duplicate entries removed.\n")

    # =============================================
    # STAGE 3: DEDUPLICATION
    # =============================================
    print("========== [STAGE 3 START] Deduplication ==========")
    df = deduplicate_customers(df)
    print("‚úÖ [STAGE 3 COMPLETE] Duplicate CustomerIDs deduplicated.\n")

    # =============================================
    # STAGE 4: STANDARDIZATION & NORMALIZATION
    # =============================================
    print("========== [STAGE 4 START] Standardization & Normalization ==========")
    df = standardize_customer_id(df)
    df = standardize_dob(df)
    df = derive_age_features(df)
    df = derive_age_group(df)
    df = drop_dob_after_age_derived(df)
    df = standardize_gender(df)
    df = standardize_location(df)
    print("‚úÖ [STAGE 4 COMPLETE] Standardization and normalization finished.\n")

    # Snapshot of the data after standardization, before any imputation
    print("========== [Standardized Customer Dataset START] Save Standardized Dataset ==========")
    base_name, ext = os.path.splitext(original_customer_dataset_name)
    standardized_file = f"{base_name}_standardized{ext}"
    df.to_csv(standardized_file, index=False)
    print(f"‚úÖ [STANDARDIZED STAGE COMPLETE] Standardized Customer dataset saved as: {standardized_file}\n")

    # =============================================
    # STAGE 5: MISSING VALUE HANDLING
    # =============================================
    print("========== [STAGE 5 START] Missing Value Handling ==========")
    df = handle_missing_values_customer(df)
    print("‚úÖ [STAGE 5 COMPLETE] Missing values handled.\n")

    # =============================================
    # STAGE 6: OUTLIER DETECTION
    # =============================================
    print("========== [STAGE 6 START] Outlier Detection ==========")
    df = customer_detect_outliers(df)
    print("‚úÖ [STAGE 6 COMPLETE] Outliers handled.\n")

    # =============================================
    # SAVE CLEANED DATASET
    # =============================================
    print("========== [FINAL STAGE START] Save Cleaned Dataset ==========")
    # base_name and ext were computed when the standardized snapshot was saved
    cleaned_file = f"{base_name}_cleaned{ext}"
    df.to_csv(cleaned_file, index=False)
    print(f"‚úÖ [FINAL STAGE COMPLETE] Cleaned dataset saved as: {cleaned_file}\n")

    print("==========================================================")
    print("üéâ Data cleaning pipeline completed successfully!\n")
    return df, cleaned_file

In [8]:
# Snapshot the raw frame first so before/after comparisons remain possible
original_customer_df = customer_df.copy()

# >>>>>>>>>>>>>>>>>>>> (CUSTOMER DATASET) IMPLEMENT CLEANING PIPELINE >>>>>>>>>>>>>>>>>>>
(cleaned_customer_df, cleaned_file_name) = clean_customer_dataset(
    customer_df,
    original_customer_dataset_name,
)

# Preview of the cleaned version that is to be stored back to the database
cleaned_customer_df.head()

üöÄ Starting customer data cleaning pipeline...

[LOG] Running normalize_columns_name...
[LOG] Columns after normalization: ['customerid', 'date of birth', 'gender', 'city', 'state']
‚úÖ [STAGE 0 COMPLETE] Column names normalized.

[LOG] Running customer_check_optional_columns...
[LOG] Optional column 'date of birth' fill ratio: 0.00
[LOG] Dropped optional column 'date of birth' due to too many missing values
[LOG] Optional column 'gender' fill ratio: 0.00
[LOG] Dropped optional column 'gender' due to too many missing values
[LOG] Running check_mandatory_columns for customer dataset...
[LOG] Mandatory column 'customerid' fill ratio: 1.00
[LOG] Mandatory column 'city' fill ratio: 0.55
[LOG] Mandatory column 'state' fill ratio: 0.53
We noticed that very few entries were provided for date of birth, gender. These columns have been removed. Segmentation will still be performed using geographic (City, State) and behavioral data (e.g., orders, purchase items, total spend).

Missing Data Summ

Unnamed: 0,customerid,city,state
0,CUST0001,Sungai Besi,Wilayah Persekutuan Kuala Lumpur
1,CUST0002,Mutiara Damansara,Selangor
2,CUST0003,Shah Alam,Selangor
3,CUST0004,Muar,Johor
4,CUST0005,Petaling Jaya,Selangor


In [9]:
# ============================================= (CUSTOMER DATASET) DATASET CLEANING REPORT =============================================
def generate_cleaning_report(original_df, cleaned_df, filename_prefix="cleaning_summary"):
    """
    Build, save and print a before/after data-quality summary.

    The summary has one row per column of ``original_df`` and reports:
    - missing-value counts before and after cleaning
    - the number of fully duplicated rows before and after cleaning
      (a frame-level figure repeated on every row)

    Parameters
    ----------
    original_df : pd.DataFrame
        Raw dataset as loaded, before any cleaning.
    cleaned_df : pd.DataFrame
        Dataset returned by the cleaning pipeline.
    filename_prefix : str
        Prefix of the CSV report; a ``_YYYYmmdd_HHMMSS`` timestamp and
        ``.csv`` are appended.

    Returns
    -------
    pd.DataFrame
        The summary table, indexed by the original column names.
    """
    original_columns = original_df.columns

    def _normalized(name):
        # Same normalization the pipeline applies to column names (strip + lower-case)
        return str(name).strip().lower()

    # --- Missing Values ---
    missing_before = original_df.isna().sum()

    # The cleaned frame carries normalized column names, so match each original
    # column on its normalized name instead of the raw label. Columns removed
    # during cleaning have no "after" figure and are reported as <NA>.
    after_by_normalized_name = {
        _normalized(name): count for name, count in cleaned_df.isna().sum().items()
    }
    missing_after = pd.Series(
        [after_by_normalized_name.get(_normalized(name), pd.NA) for name in original_columns],
        index=original_columns,
        dtype="Int64",
    )

    # --- Duplicates ---
    duplicates_before = len(original_df) - len(original_df.drop_duplicates())
    duplicates_after = len(cleaned_df) - len(cleaned_df.drop_duplicates())

    report = {
        'Missing Values (Before)': missing_before,
        'Missing Values (After)': missing_after,
        'Duplicates (Before)': [duplicates_before] * len(original_columns),
        'Duplicates (After)': [duplicates_after] * len(original_columns),
    }

    # --- Combine into Summary DataFrame ---
    summary = pd.DataFrame(report, index=original_columns)
    summary.index.name = "Column Name"

    # --- Save to CSV (with timestamp) ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{filename_prefix}_{timestamp}.csv"
    summary.to_csv(filename, index=True)

    # --- Print Summary (compact view) ---
    print("\nüìä === DATA CLEANING SUMMARY REPORT ===")
    print(summary.to_string())
    print(f"\n‚úÖ Report saved to: {filename}")

    return summary


In [10]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> (CUSTOMER DATASET) GENERATE DATASET CLEANING REPORT >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# summary = generate_cleaning_report(original_customer_df, cleaned_customer_df)

## Perform Data Cleaning Pipeline - Order Dataset

In [11]:
# Step 2: Load the order dataset
# Replace the path below with the actual location of your order CSV file
order_file_path = "C:/Users/user/OneDrive/Desktop/Onedrive_YuyanDipsy/OneDrive/UM Y4S1/WIA3002 FYP 1 & 2/FYP2/Data/Soapan Santun - Use/2021 - 2025 Order.csv"

# Derive the file name from the path so the two can never drift apart
# (used later to name the standardized/cleaned output files)
original_order_dataset_name = os.path.basename(order_file_path)

# Read dataset
order_df = pd.read_csv(order_file_path)

# Show first few rows (original raw data)
order_df.head()

Unnamed: 0,OrderID,CustomerID,Purchase Item,Purchase Date,Item Price,Purchase Quantity,Total Spend,Transaction Method
0,211103JN8114CU,CUST0001,Aloe Vera Homemade Soap | Acne-Prone Skin | Ba...,03/11/2021 20:21,18.0,1.0,RM42.03,ShopeePay
1,211103JN8114CU,CUST0001,Oats & Calendula Dandruff Shampoo Bar | Sulfat...,03/11/2021 20:21,23.0,1.0,,Shopee Coin
2,211104MMPQH994,CUST0002,Calendula Homemade Soap | Suitable For Eczema-...,04/11/2021 15:17,23.0,2.0,$70,ShopeePay
3,211104MMPQH994,CUST0002,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,04/11/2021 15:17,24.0,1.0,,
4,211104MSJ8BGNC,CUST0003,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,04/11/2021 16:44,24.0,1.0,26,ShopeePay


In [12]:
# ============================================= (ORDER DATASET) STAGE 0: NORMALIZE COLUMN NAMES =============================================
# From Generic function: normalize_columns_name

# ============================================= (ORDER DATASET) STAGE 1: SCHEMA & COLUMN VALIDATION =============================================
# From Generic function

# ============================================= (ORDER DATASET) STAGE 2: REMOVE DUPLICATE ENTRY ROW =================================================
# From Generic function: remove_duplicate_entries

# ============================================= (ORDER DATASET) STAGE 3: STANDARDIZATION & NORMALIZATION =============================================
# From Generic function: standardize_customer_id

def standardized_order_id(df):
    """
    Normalise the 'orderid' column to trimmed, upper-case text.

    Missing IDs become empty strings ('') rather than staying NaN.
    A frame without an 'orderid' column is returned untouched.
    """
    print("[LOG] Running standardize_order_id...")

    if 'orderid' not in df.columns:
        print("[LOG] OrderID column not found, skipping")
        return df

    # Blank out nulls first so the string cast never yields the literal text 'nan'
    order_ids = df['orderid'].fillna('')
    order_ids = order_ids.astype(str)
    order_ids = order_ids.str.strip().str.upper()

    df.loc[:, 'orderid'] = order_ids
    print("[LOG] OrderID column standardized")
    return df

def standardize_purchase_item(df):
    """
    Standardize 'purchase item' names: trim whitespace and apply Title Case.

    Missing values are preserved as NaN so the later missing-value stage can
    still detect and fill them. (Casting the whole column to str would turn
    NaN into the literal text "nan", which then reads as a real item name.)

    Returns the same DataFrame; frames without the column are left untouched.
    """
    print("[LOG] Running standardized_purchase_item...")
    if "purchase item" in df.columns:
        items = df["purchase item"]
        cleaned = (
            items
            .astype(str)
            .str.strip()
            .str.title()
            .where(items.notna())  # restore NaN wherever the source value was missing
        )
        df.loc[:, "purchase item"] = cleaned
        print("[LOG] Purchase item standardized")
    return df

def standardize_purchase_date(df):
    """
    Split 'purchase date' into an ISO date column and a separate time column.

    - 'purchase date' is rewritten as 'YYYY-MM-DD' text (unparseable values end up missing).
    - 'purchase time' is added as 'HH:MM:SS' for rows whose raw text contained a ':';
      all other rows keep None.
    - Dates are parsed day-first (e.g. 03/11/2021 is 3 November 2021).

    Works on a copy, so the caller's frame is not modified when the column exists.
    NOTE(review): depending on the pandas version, to_datetime may infer a single
    format from the first value, so rows written in a different layout could be
    coerced to NaT - confirm against the installed pandas.
    """
    print("[LOG] Running standardize_purchase_date...")

    if "purchase date" not in df.columns:
        print("[WARN] 'purchase date' column not found, skipping.")
        print("[LOG] Purchase date standardization complete.")
        return df

    # Work on a private copy (also avoids SettingWithCopyWarning)
    df = df.copy()

    # Trim the raw text before parsing
    trimmed_text = df["purchase date"].astype(str).str.strip()
    df.loc[:, "purchase date"] = trimmed_text

    # Parse into a temporary datetime column
    parsed_values = pd.to_datetime(
        df["purchase date"], errors="coerce", dayfirst=True
    )
    df.loc[:, "purchase datetime"] = parsed_values

    # Rows whose raw text carried a time component
    rows_with_time = df["purchase date"].str.contains(":", regex=False)

    # Write the standardized date, then the time for the rows that have one
    df.loc[:, "purchase date"] = df["purchase datetime"].dt.strftime("%Y-%m-%d")
    df.loc[:, "purchase time"] = None
    time_text = df.loc[rows_with_time, "purchase datetime"].dt.strftime("%H:%M:%S")
    df.loc[rows_with_time, "purchase time"] = time_text

    # The temporary datetime column is no longer needed
    df = df.drop(columns=["purchase datetime"], errors="ignore")

    print("[LOG] Purchase date standardization complete.")
    return df

def standardized_item_price_and_total_spend(df):
    """
    Turn the 'item price' and 'total spend' columns into plain numbers.

    For each of the two columns that is present:
    - every character other than digits, '.' and '-' is stripped
      (currency symbols, codes such as RM / MYR, thousands separators, ...)
    - the remainder is converted to numeric; anything unparseable becomes NaN
    - values are rounded to 2 decimal places

    The frame is modified in place and returned.
    """
    print("[LOG] Running standardized_item_price_and_total_spend...")

    money_columns = ("item price", "total spend")
    for column in money_columns:
        if column not in df.columns:
            print(f"[LOG] '{column}' column not found, skipping")
            continue

        # Keep only the characters that can form a number
        numeric_text = df[column].astype(str).str.replace(r"[^\d\.\-]", "", regex=True)

        # Invalid text is coerced to NaN, then values are rounded to cents
        df[column] = pd.to_numeric(numeric_text, errors="coerce").round(2)

        print(f"[LOG] {column} standardized: numeric, 2 decimal places")

    return df

def standardize_purchase_quantity(df):
    """
    Convert 'purchase quantity' to a nullable integer column (missing values stay missing).

    Characters other than digits, '.' and '-' are removed (e.g. 'pcs', 'x', 'units'),
    the remainder is parsed as a number (unparseable -> NaN), rounded to the nearest
    whole number and stored with the nullable 'Int64' dtype.
    """
    print("[LOG] Running standardize_purchase_quantity...")

    column = "purchase quantity"
    if column not in df.columns:
        print("[LOG] 'purchase quantity' column not found, skipping")
        return df

    # Strip unit text and other noise, keeping digits, '.' and '-'
    numeric_text = df[column].astype(str).str.replace(r"[^\d\.\-]", "", regex=True)

    # Invalid entries are coerced to NaN
    quantities = pd.to_numeric(numeric_text, errors="coerce")

    # Round to whole units and store as nullable integers
    df[column] = quantities.round(0).astype("Int64")

    print("[LOG] Purchase quantity standardized to integer format")
    return df

def standardize_transaction_method(df):
    """
    Standardize 'transaction method' into one of the categories
    ['Cash', 'Card', 'E-Wallet', 'Online Banking', 'Auto-Debit', 'Cheque'].

    Values that match no category - including missing values - become 'Unknown'.

    Matching is done once against the lower-cased, trimmed original text and
    the first matching category wins. 'Auto-Debit' is checked before 'Card'
    because 'Card' also matches the bare word "debit"; otherwise
    "auto debit" would be labelled as a card payment.

    The frame is modified in place and returned; frames without the column
    are returned untouched.
    """
    print("[LOG] Running standardize_transaction_method...")

    if "transaction method" not in df.columns:
        print("[LOG] 'transaction method' column not found, skipping.")
        return df

    # Step 1: Normalize text (missing values become the text 'nan'/'none', which matches nothing)
    normalized = (
        df["transaction method"]
        .astype(str)
        .str.lower()
        .str.strip()
    )

    # Step 2: Define patterns (non-capturing groups), listed in matching priority order
    patterns = {
        "Auto-Debit": r"\b(?:auto.?debit|standing|recurring|subscription|auto\s*pay)\b",
        "Cash": r"\b(?:cash|tunai|otc|counter)\b",
        "Card": r"\b(?:card|visa|master|credit|debit|amex|credit.?debit)\b",
        "E-Wallet": r"\b(?:tng|touch\s*n\s*go|grab\s*pay|grabpay|boost|shopee\s*pay|shopeepay|spaylater|duitnow|ewallet|e-?wallet|qr|qr\s*pay|qrcode)\b",
        # 'banking' is listed explicitly: \bbank\b alone does not match "online banking"
        "Online Banking": r"\b(?:bank|banking|transfer|fpx|online\s*payment|maybank2u|cimbclicks|duitnow\s*qr|public\s*bank)\b",
        "Cheque": r"\b(?:cheque|cek|check)\b",
    }

    # Step 3: Vectorized matching on the normalized text only, so a label written
    # for one category can never be re-matched by a later pattern
    conditions = [
        normalized.str.contains(pattern, na=False, regex=True).to_numpy(dtype=bool)
        for pattern in patterns.values()
    ]

    # Step 4: First matching category wins; everything else is 'Unknown'
    df["transaction method"] = np.select(conditions, list(patterns.keys()), default="Unknown")

    print("[LOG] Transaction method standardized successfully.")
    return df

# ============================================= (ORDER DATASET) STAGE 4: MISSING VALUE HANDLING =============================================
def handle_missing_values_order(df):
    """
    Stage 4: handle missing values in the order dataset.

    Strategy (for SME context):
    - Drop rows if critical identifiers are missing (orderid, customerid, purchase date).
      A blank / whitespace-only string counts as missing, because the order-ID
      standardization step writes '' for missing IDs.
    - Drop rows if both item price and total spend are missing
    - Fill or infer non-critical missing fields logically:
        - purchase item: "Unknown Item"
        - purchase quantity: 1
        - item price: median price
        - total spend: item_price * quantity (if available), else median total
        - transaction method: "Unknown"
        - purchase time: "Unknown"

    Columns that are absent are skipped. The input frame is not modified;
    a new DataFrame is returned.
    """

    initial_count = len(df)

    # Drop rows missing critical identifiers (NaN or blank text)
    critical_cols = ["orderid", "customerid", "purchase date"]
    existing_critical = [c for c in critical_cols if c in df.columns]
    missing_identifier = np.zeros(len(df), dtype=bool)
    for col in existing_critical:
        values = df[col]
        is_blank = values.map(lambda v: isinstance(v, str) and not v.strip()).to_numpy(dtype=bool)
        missing_identifier |= values.isna().to_numpy(dtype=bool) | is_blank
    df = df.loc[~missing_identifier]
    print(f"[LOG] Dropped rows with missing critical identifiers: {initial_count - len(df)}")

    # Drop rows missing both financial info (only consider the columns that exist)
    before_financial = len(df)
    financial_cols = [c for c in ("item price", "total spend") if c in df.columns]
    if financial_cols:
        df = df.dropna(subset=financial_cols, how="all")
    print(f"[LOG] Dropped {before_financial - len(df)} rows with no financial info")

    # Work on an independent copy so the fills below never write into a slice of the caller's frame
    df = df.copy()

    # Fill non-critical fields
    if "purchase item" in df.columns:
        df["purchase item"] = df["purchase item"].fillna("Unknown Item")

    if "purchase quantity" in df.columns:
        df["purchase quantity"] = df["purchase quantity"].fillna(1)

    # Handle item price (replace with median)
    if "item price" in df.columns:
        median_price = df["item price"].median(skipna=True)
        df["item price"] = df["item price"].fillna(median_price)

    # Handle total spend (calculate or fallback)
    if {"item price", "purchase quantity", "total spend"}.issubset(df.columns):
        mask_missing_total = df["total spend"].isna()
        df.loc[mask_missing_total, "total spend"] = (
            df.loc[mask_missing_total, "item price"] * df.loc[mask_missing_total, "purchase quantity"]
        )
        # Fill any still missing values with median
        median_total = df["total spend"].median(skipna=True)
        df["total spend"] = df["total spend"].fillna(median_total)

    # Transaction method: blank / "NaN" text / missing -> "Unknown"
    if "transaction method" in df.columns:
        methods = df["transaction method"]
        df["transaction method"] = methods.mask(methods.isin(["", "NaN"])).fillna("Unknown")

    # Purchase time: missing -> "Unknown"
    if "purchase time" in df.columns:
        df["purchase time"] = df["purchase time"].fillna("Unknown")

    # Final summary (guard the ratio so an empty input does not divide by zero)
    dropped_total = initial_count - len(df)
    dropped_ratio = dropped_total / initial_count if initial_count else 0.0
    print(f"[LOG] Dropped total {dropped_total} rows ({dropped_ratio:.2%}) due to missing critical data")
    print(f"[LOG] Dataset now has {len(df)} rows after missing value handling")
    print("========== [STAGE 4 COMPLETE] ==========")

    return df

# ============================================= (ORDER DATASET) STAGE 5: OUTLIER DETECTION =============================================
def order_detect_outliers(df):
    """
    Stage 5: Handle outliers for the order dataset by capping (winsorizing).

    - IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) are used if the dataset has < 500 rows.
    - Percentile capping (1st-99th) is used if the dataset has >= 500 rows.
    - Columns processed: purchase quantity, total spend ('item price' is not capped).
    - For integer columns the bounds are rounded inward to whole numbers so the
      capped values stay valid integers (a nullable Int64 column cannot hold a
      fractional bound such as 3.5).

    Columns that are absent or entirely missing are skipped. The input frame
    is not modified; a capped copy is returned.
    """
    print(f"[LOG] Dataset has {len(df)} rows")

    numeric_cols = ["purchase quantity", "total spend"]
    df = df.copy()

    for col in numeric_cols:
        if col not in df.columns:
            continue

        if df[col].dropna().empty:
            print(f"[WARN] {col} is empty or missing, skipping.")
            continue

        if len(df) < 500:
            # IQR Method
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            method = "IQR"
        else:
            # Percentile Capping
            lower = df[col].quantile(0.01)
            upper = df[col].quantile(0.99)
            method = "Percentile Capping"

        if pd.api.types.is_integer_dtype(df[col]):
            # Keep integer columns integral: tighten the bounds to the nearest
            # whole numbers inside the computed range
            lower = int(np.ceil(lower))
            upper = max(int(np.floor(upper)), lower)

        # Apply capping
        df[col] = df[col].clip(lower, upper)
        print(f"[LOG] {method} applied on '{col}', capped between [{lower:.2f}, {upper:.2f}]")

    print("‚úÖ [STAGE 5 COMPLETE] Outliers handled successfully.")
    return df



In [13]:
# ============================================= (ORDER DATASET) DATASET CLEANING PIPELINE =============================================
def clean_order_dataset(df, original_order_dataset_name):
    """
    Run the full order-dataset cleaning pipeline and save its outputs.

    Stages, in order:
      0. normalize column names
      1. validate the mandatory order columns (the validation message is printed)
      2. remove duplicate rows
      3. standardize IDs, item names, dates, amounts, quantities and payment methods
         (the result is saved as '<name>_standardized<ext>')
      4. handle missing values
      5. cap outliers
    The final frame is saved as '<name>_cleaned<ext>'.

    Parameters
    ----------
    df : pd.DataFrame
        Raw order data.
    original_order_dataset_name : str
        File name of the raw dataset; output file names are derived from it.

    Returns
    -------
    tuple
        (cleaned DataFrame, file name of the saved cleaned CSV)
    """
    print("üöÄ Starting order data cleaning pipeline...\n")

    # ---------- Stage 0: normalize column names (generic function) ----------
    print("========== [STAGE 0 START] Normalize Column Names ==========")
    df = normalize_columns_name(df)
    print("‚úÖ [STAGE 0 COMPLETE] Column names normalized.\n")

    # ---------- Stage 1: schema & column validation ----------
    print("========== [STAGE 1 START] Schema & Column Validation ==========")
    required_columns = [
        "orderid",
        "customerid",
        "purchase item",
        "purchase date",
        "item price",
        "purchase quantity",
        "total spend",
        "transaction method",
    ]
    df, message = check_mandatory_columns(df, dataset_type="order", mandatory_columns=required_columns)
    print(message)
    print("‚úÖ [STAGE 1 COMPLETE] Schema validation done.\n")

    # ---------- Stage 2: remove duplicate rows (generic function) ----------
    print("========== [STAGE 2 START] Remove Duplicate Entry Rows ==========")
    df = remove_duplicate_entries(df)
    print("‚úÖ [STAGE 2 COMPLETE] Duplicate entries removed.\n")

    # ---------- Stage 3: standardization & normalization ----------
    print("========== [STAGE 3 START] Standardization & Normalization ==========")
    standardization_steps = (
        standardized_order_id,
        standardize_customer_id,
        standardize_purchase_item,
        standardize_purchase_date,
        standardized_item_price_and_total_spend,
        standardize_purchase_quantity,
        standardize_transaction_method,
    )
    for apply_step in standardization_steps:
        df = apply_step(df)
    print("‚úÖ [STAGE 3 COMPLETE] Standardization and normalization finished.\n")

    # ---------- Save the standardized (pre-imputation) snapshot ----------
    print("========== [Standardized Order Dataset START] Save Standardized Dataset ==========")
    name_stem, name_ext = os.path.splitext(original_order_dataset_name)
    standardized_file = f"{name_stem}_standardized{name_ext}"
    df.to_csv(standardized_file, index=False, float_format="%.2f")
    print(f"‚úÖ [STANDARDIZED STAGE COMPLETE] Standardized Order dataset saved as: {standardized_file}\n")

    # ---------- Stage 4: missing value handling ----------
    print("========== [STAGE 4 START] Missing Value Handling ==========")
    df = handle_missing_values_order(df)
    print("‚úÖ [STAGE 4 COMPLETE] Missing values handled.\n")

    # ---------- Stage 5: outlier detection ----------
    print("========== [STAGE 5 START] Outlier Detection ==========")
    df = order_detect_outliers(df)
    print("‚úÖ [STAGE 5 COMPLETE] Outliers handled.\n")

    # ---------- Save the cleaned dataset ----------
    print("========== [FINAL STAGE START] Save Cleaned Dataset ==========")
    cleaned_file = f"{name_stem}_cleaned{name_ext}"
    df.to_csv(cleaned_file, index=False)
    print(f"‚úÖ [FINAL STAGE COMPLETE] Cleaned dataset saved as: {cleaned_file}\n")

    print("==========================================================")
    print("üéâ Data cleaning pipeline completed successfully!\n")
    return df, cleaned_file


In [14]:
# Keep a copy for comparison.
# Take it BEFORE running the pipeline: the column-name normalization step
# reassigns .columns on the frame it receives, so order_df itself is altered.
original_order_df = order_df.copy()

# >>>>>>>>>>>>>>>>>>>> (ORDER DATASET) IMPLEMENT CLEANING PIPELINE >>>>>>>>>>>>>>>>>>>
# Returns the cleaned frame and the file name of the saved cleaned CSV
cleaned_order_df, cleaned_file_name = clean_order_dataset(order_df, original_order_dataset_name)

# AFTER CLEANING: preview the cleaned version that is to be stored back to the database
cleaned_order_df.head(20)

üöÄ Starting order data cleaning pipeline...

[LOG] Running normalize_columns_name...
[LOG] Columns after normalization: ['orderid', 'customerid', 'purchase item', 'purchase date', 'item price', 'purchase quantity', 'total spend', 'transaction method']
‚úÖ [STAGE 0 COMPLETE] Column names normalized.

[LOG] Running check_mandatory_columns for order dataset...
[LOG] Mandatory column 'orderid' fill ratio: 1.00
[LOG] Mandatory column 'customerid' fill ratio: 1.00
[LOG] Mandatory column 'purchase item' fill ratio: 0.92
[LOG] Mandatory column 'purchase date' fill ratio: 1.00
[LOG] Mandatory column 'item price' fill ratio: 0.92
[LOG] Mandatory column 'purchase quantity' fill ratio: 0.92
[LOG] Mandatory column 'total spend' fill ratio: 0.55
[LOG] Mandatory column 'transaction method' fill ratio: 0.67
Some key fields in the order dataset have a high number of missing values: total spend, transaction method. The system will continue cleaning and handle missing values automatically, but we STRON

  df.loc[:, "purchase datetime"] = pd.to_datetime(


[LOG] item price standardized: numeric, 2 decimal places
[LOG] total spend standardized: numeric, 2 decimal places
[LOG] Running standardize_purchase_quantity...
[LOG] Purchase quantity standardized to integer format
[LOG] Running standardize_transaction_method...
[LOG] Transaction method standardized successfully.
‚úÖ [STAGE 3 COMPLETE] Standardization and normalization finished.

‚úÖ [STANDARDIZED STAGE COMPLETE] Standardized Order dataset saved as: 2021 - 2025 Order_standardized.csv

[LOG] Dropped rows with missing critical identifiers: 0
[LOG] Dropped 407 rows with no financial info
[LOG] Dropped total 407 rows (7.10%) due to missing critical data
[LOG] Dataset now has 5328 rows after missing value handling
‚úÖ [STAGE 4 COMPLETE] Missing values handled.

[LOG] Dataset has 5328 rows
[LOG] Percentile Capping applied on 'purchase quantity', capped between [1.00, 6.00]
[LOG] Percentile Capping applied on 'total spend', capped between [11.90, 243.31]
‚úÖ [STAGE 5 COMPLETE] Outliers hand

Unnamed: 0,orderid,customerid,purchase item,purchase date,item price,purchase quantity,total spend,transaction method,purchase time
0,211103JN8114CU,CUST0001,Aloe Vera Homemade Soap | Acne-Prone Skin | Ba...,2021-11-03,18.0,1,42.03,E-Wallet,20:21:00
1,211103JN8114CU,CUST0001,Oats & Calendula Dandruff Shampoo Bar | Sulfat...,2021-11-03,23.0,1,23.0,Unknown,20:21:00
2,211104MMPQH994,CUST0002,Calendula Homemade Soap | Suitable For Eczema-...,2021-11-04,23.0,2,70.0,E-Wallet,15:17:00
3,211104MMPQH994,CUST0002,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-04,24.0,1,24.0,Unknown,15:17:00
4,211104MSJ8BGNC,CUST0003,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-04,24.0,1,26.0,E-Wallet,16:44:00
5,211104NAHE4EFA,CUST0004,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-04,24.0,2,243.307,Card,21:47:00
6,211104NAHE4EFA,CUST0004,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-04,24.0,2,48.0,Unknown,21:47:00
7,211104NAHE4EFA,CUST0004,Oats & Calendula Dandruff Shampoo Bar | Sulfat...,2021-11-04,23.0,2,46.0,Unknown,21:47:00
8,211105PGD0R3EG,CUST0005,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-05,24.0,1,25.0,Card,09:05:00
10,211105QAU8FV5B,CUST0006,Green Tea & Mint Hair Loss Shampoo | Sulfate-F...,2021-11-05,24.0,1,45.0,E-Wallet,16:58:00
