In [8]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np


In [9]:
# Step 2: Read the raw mall-customer CSV into a DataFrame
csv_path = "Mall customer segmentation.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell's output
df.head(5)


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [10]:
#  Step 3: Basic info about the dataset
# Prints the index range, every column's non-null count and dtype, and the
# frame's memory usage, so gaps and mistyped columns can be spotted early.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [11]:
# Step 4: Count missing entries per column
# (isna is the canonical spelling; isnull is an alias of it.)
df.isna().sum()


CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [12]:
# Step 5: Normalise column labels — trim surrounding whitespace, lowercase,
# and turn inner spaces into underscores. Punctuation such as "(k$)" is kept.
trimmed_labels = df.columns.str.strip()
lowered_labels = trimmed_labels.str.lower()
df.columns = lowered_labels.str.replace(' ', '_')
print("✅ Cleaned column names:", list(df.columns))


✅ Cleaned column names: ['customerid', 'gender', 'age', 'annual_income_(k$)', 'spending_score_(1-100)']


In [13]:
#  Step 6: Handle missing values
# Numeric columns are imputed with the column mean, text (object) columns with
# their most frequent value (mode).
#
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a column selection,
# which emits a chained-assignment FutureWarning in pandas 2.x and does not
# update `df` at all once Copy-on-Write is enabled.
#
# NOTE(review): every numeric column is imputed, including identifier-like ones
# such as customerid — confirm that is intended.

for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].mean())

for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() is empty when the column has no observed values; in that case
    # there is nothing to impute from, so the column is left untouched
    # rather than raising on modes[0].
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

print("✅ Missing values handled")


✅ Missing values handled


In [14]:
# Step 7: Drop rows that are exact duplicates of an earlier row and report
# how many were removed.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
removed_count = before - after
print(f"✅ Removed {removed_count} duplicate rows")


✅ Removed 0 duplicate rows


In [15]:
# Step 8: Tidy every text (object) column — trim surrounding whitespace, then
# title-case the value so that e.g. " male" and "MALE" both become "Male".
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    trimmed = df[col].str.strip()
    df[col] = trimmed.str.title()
print("✅ Text values standardized")


✅ Text values standardized


In [16]:
# Step 9: Make sure the measurement columns are numeric
print("Before fixing:", df.dtypes, sep="\n")

# Columns expected to hold numbers; any that are absent are skipped.
# errors='coerce' turns unparseable entries into NaN instead of raising.
cols_to_fix = ['age', 'annual_income_(k$)', 'spending_score_(1-100)']
present_cols = [name for name in cols_to_fix if name in df.columns]
for col in present_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("\nAfter fixing:", df.dtypes, sep="\n")


Before fixing:
customerid                 int64
gender                    object
age                        int64
annual_income_(k$)         int64
spending_score_(1-100)     int64
dtype: object

After fixing:
customerid                 int64
gender                    object
age                        int64
annual_income_(k$)         int64
spending_score_(1-100)     int64
dtype: object


In [17]:
#  Step 10: Handle any remaining NaN values
# NaN can reappear after Step 6 because Step 9 coerces unparseable entries to
# NaN. A blanket fillna(0) would record those as a literal 0 (e.g. age 0) and
# would put an integer 0 into text columns such as gender. Instead, reuse the
# Step 6 policy: column mean for numeric columns, most frequent value otherwise.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in df.columns[df.isna().any()]:
    if col in numeric_cols:
        fill_value = df[col].mean()
    else:
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else np.nan
    if pd.isna(fill_value):
        # The column has no observed values at all, so there is nothing to
        # impute from; say so instead of inventing a value.
        print(f"⚠️ Column '{col}' has no observed values to impute from; left as missing")
        continue
    # Assign back rather than using inplace=True so the update is explicit.
    df[col] = df[col].fillna(fill_value)
print("✅ Remaining missing values filled")


✅ Remaining missing values filled


In [18]:
# Step 11: Inspect the cleaned frame — its dimensions, then a preview of the
# first five rows as the cell's output.
n_rows, n_cols = df.shape
print("✅ Shape:", (n_rows, n_cols))
df.head(5)


✅ Shape: (200, 5)


Unnamed: 0,customerid,gender,age,annual_income_(k$),spending_score_(1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [19]:
# Step 12: Write the cleaned frame to disk (without the row index) and confirm
# the destination file name.
output_path = "cleaned_mall_customer_segmentation.csv"
df.to_csv(output_path, index=False)
print(f"💾 Cleaned dataset saved as '{output_path}'")


💾 Cleaned dataset saved as 'cleaned_mall_customer_segmentation.csv'


In [None]:
Short Summary of Changes

1) The Mall Customer Segmentation dataset was cleaned and preprocessed to ensure data quality and consistency.
2) Missing numeric values (such as Age, Annual Income (k$), and Spending Score (1-100)) were filled with the column mean, while missing categorical entries were replaced with the most frequent value; the raw data contained no missing values, so this step changed nothing here.
3) All duplicate rows were removed (none were found), and column names were standardized to lowercase with underscores in place of spaces.
4) Text data (such as Gender) was cleaned by trimming surrounding whitespace and converting to title case.
5) Data types for numeric columns were checked and coerced to numeric where needed (they were already int64), and the cleaned dataset was saved as cleaned_mall_customer_segmentation.csv, ready for analysis.