In [6]:
# Project title: E-commerce customer dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# A raw string already keeps backslashes literal, so they must not be doubled
# (doubling them embeds repeated separators in the path). The location lives in
# one named constant so it is easy to repoint on another machine.
DATA_PATH = r"C:\Users\ashis\Downloads\ecommerce_customers_large.csv"
df = pd.read_csv(DATA_PATH)

# Free-text categorical columns that get normalised in Step 2.
TEXT_COLUMNS = ["Gender", "BrowserType", "DeviceType"]

# --------------------------
# 🔍 Step 1: Data Assessment
# --------------------------

# Quick health check of the raw frame: nulls, duplicates, dtypes, the distinct
# labels of each categorical column, and numeric summary statistics.
print("Missing values:\n", df.isnull().sum())
print("\nDuplicate rows:", df.duplicated().sum())
print("\nData types:\n", df.dtypes)
print("\nUnique Genders:", df["Gender"].unique())
print("Unique Browser Types:", df["BrowserType"].unique())
print("Unique Device Types:", df["DeviceType"].unique())
print("\nSummary Statistics:\n", df.describe())

# -------------------------
# 🧹 Step 2: Data Cleaning
# -------------------------

# Remove duplicate rows
df = df.drop_duplicates()

# Standardize text format: trim surrounding whitespace and force
# "First-letter-upper, rest-lower" so variants such as " male" and "MALE"
# collapse into the single label "Male".
for text_col in TEXT_COLUMNS:
    df[text_col] = df[text_col].str.strip().str.capitalize()

# -------------------------------
# ⚠️ Step 3: Outlier Detection
# -------------------------------

# Function to remove outliers using the IQR rule
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels preserved) containing only the
        in-range rows. Rows where ``column`` is NaN are dropped as well,
        because NaN never compares as inside the fences.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread

    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    in_range = values.between(lower, upper)

    # Return an independent copy so callers can add columns to the result
    # without pandas raising SettingWithCopyWarning.
    return df[in_range].copy()

# Apply to relevant numeric columns
# Filters are applied sequentially, so each column's quartiles are computed on
# the rows that survived the previous columns' filters.
OUTLIER_COLUMNS = ["AnnualIncome", "SpendingScore", "PurchaseFrequency", "LastPurchaseDaysAgo"]
for col in OUTLIER_COLUMNS:
    df = remove_outliers_iqr(df, col)

# The filtered frame is a slice of the previous one; take an explicit copy so
# the column assignments below do not raise SettingWithCopyWarning.
df = df.copy()

# ------------------------------------
# 🛠️ Step 4: Feature Engineering
# ------------------------------------

RECENT_WINDOW_DAYS = 30  # a purchase within this many days marks a "recent" customer
OUTPUT_PATH = "cleaned_ecommerce_customer_data.csv"

# 1. Income per purchase (avoid division by zero)
# Vectorised instead of a row-wise apply: rows with no purchases are masked out
# of the division and then set to 0. This also works on an empty frame, where
# `df.apply(..., axis=1)` returns a DataFrame that cannot be assigned to a column.
purchase_freq = df["PurchaseFrequency"]
has_purchases = purchase_freq > 0
df["IncomePerPurchase"] = (
    df["AnnualIncome"]
    .div(purchase_freq.where(has_purchases))
    .where(has_purchases, 0.0)
)

# 2. Recent Customer (1 if purchased in the last 30 days)
# Explicit int64 keeps the dtype identical across platforms.
df["RecentCustomer"] = (df["LastPurchaseDaysAgo"] <= RECENT_WINDOW_DAYS).astype("int64")

# 3. Encode Gender
# Values outside the mapping become NaN; report them instead of letting them
# pass silently into the exported file.
df["GenderEncoded"] = df["Gender"].map({"Male": 0, "Female": 1})
unmapped_gender = int(df["GenderEncoded"].isna().sum())
if unmapped_gender:
    print(f"\nWarning: {unmapped_gender} row(s) have a Gender value that is not 'Male' or 'Female'")

# 4. One-hot encode BrowserType and DeviceType
# drop_first=True drops one indicator per feature to avoid redundant columns.
df = pd.get_dummies(df, columns=["BrowserType", "DeviceType"], drop_first=True)

# -------------------------------
# 📦 Final Output Summary
# -------------------------------

print("\nFinal dataset shape:", df.shape)
print("\nColumns in final dataset:\n", df.columns)

# Save cleaned and processed data
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nCleaned data saved as '{OUTPUT_PATH}'")


Missing values:
 CustomerID             0
Age                    0
Gender                 0
AnnualIncome           0
SpendingScore          0
BrowserType            0
PurchaseFrequency      0
LastPurchaseDaysAgo    0
DeviceType             0
dtype: int64

Duplicate rows: 0

Data types:
 CustomerID              int64
Age                     int64
Gender                 object
AnnualIncome            int64
SpendingScore           int64
BrowserType            object
PurchaseFrequency       int64
LastPurchaseDaysAgo     int64
DeviceType             object
dtype: object

Unique Genders: ['Male' 'Female']
Unique Browser Types: ['Edge' 'Opera' 'Chrome' 'Firefox' 'Safari']
Unique Device Types: ['Tablet' 'Mobile' 'Desktop']

Summary Statistics:
         CustomerID        Age   AnnualIncome  SpendingScore  \
count   120.000000  120.00000     120.000000     120.000000   
mean   1060.500000   42.67500   68623.966667      48.266667   
std      34.785054   13.20629   29992.299637      27.213360   
m