In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np

# Load the dataset from the CSV file on disk into a DataFrame
file_path = "D://DSML PRACTICAL//Datasets//Ecommerce Customers.csv"  # Replace with your actual file path
data = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: Preview the top rows as a quick sanity check of the load
print("First few rows of the dataset:", data.head(), sep="\n")

# Step 2: Split the columns by dtype so each group gets its own fill strategy
numeric_cols = data.select_dtypes(include=['number']).columns
non_numeric_cols = data.select_dtypes(exclude=['number']).columns

# Step 3: Fill missing values in numeric columns with the column mean.
# A column that is entirely NaN has a NaN mean, so it is left unchanged.
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Step 4: Fill missing values in non-numeric columns with the most frequent value
for col in non_numeric_cols:
    modes = data[col].mode()
    # mode() ignores NaN, so it comes back empty when the whole column is
    # missing; indexing it would raise, so such a column is left as-is.
    if modes.empty:
        continue
    data[col] = data[col].fillna(modes.iloc[0])


# Step 5: Label Encoding for categorical columns (e.g., 'Avatar')
# NOTE(review): every object-dtype column is encoded, including identifier-like
# ones such as 'Email' and 'Address' seen in the preview - confirm that is intended.
label_encoder = LabelEncoder()

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, leaving the integer codes impossible
# to translate back to the original labels.
label_encoders = {}

# Loop through all object columns and replace their values with integer codes
for column in data.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Step 6: Remove Outliers Using IQR Method (0.25 and 0.75 quantiles)
# This selection is unchanged because the scaling step below reuses it.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Screen only the columns that were numeric before label encoding. Encoded
# columns hold arbitrary integer codes and may be picked up by the selector
# above; quartiles over such codes are not meaningful.
outlier_columns = [col for col in numeric_columns if col in numeric_cols]

# Accumulate one row mask over all columns instead of dropping rows inside the
# loop, so every column's bounds come from the same unfiltered data and the
# result does not depend on column order.
keep_rows = np.ones(len(data), dtype=bool)

for col in outlier_columns:
    # Calculate the first (Q1) and third (Q3) quartiles
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)

    # Calculate IQR
    IQR = Q3 - Q1

    # Define the lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Keep rows whose value lies inside the bounds (both ends inclusive)
    keep_rows &= data[col].between(lower_bound, upper_bound).to_numpy()

# .copy() detaches the result from the original frame so later column
# assignments do not raise SettingWithCopyWarning.
data = data[keep_rows].copy()

# Step 7: Scaling using Min-Max Scaler (normalizing data)
scaler = MinMaxScaler()

# Fit the scaler and compute the scaled values for all selected numeric columns
scaled_values = scaler.fit_transform(data[numeric_columns])

# `data` may be a filtered slice of an earlier frame; writing columns into such
# a slice raises SettingWithCopyWarning on pandas versions without
# copy-on-write, so the assignment goes into an explicit copy.
data = data.copy()
data[numeric_columns] = scaled_values

# Step 8: Write the processed table to CSV, omitting the row index
output_csv = "Cleaned_Encoded_Scaled_NoOutliers_IQR.csv"
data.to_csv(output_csv, index=False)

# Status line confirming the steps above have run
print("\nData cleaned, label encoded, outliers removed using IQR method, scaled, and saved.")


First few rows of the dataset:
                           Email  \
0      mstephenson@fernandez.com   
1              hduke@hotmail.com   
2               pallen@yahoo.com   
3        riverarebecca@gmail.com   
4  mstephens@davidson-herman.com   

                                             Address            Avatar  \
0       835 Frank Tunnel\nWrightmouth, MI 82180-9605            Violet   
1     4547 Archer Common\nDiazchester, CA 06566-8576         DarkGreen   
2  24645 Valerie Unions Suite 582\nCobbborough, D...            Bisque   
3   1414 David Throughway\nPort Jason, OH 22070-1220       SaddleBrown   
4  14023 Rodriguez Passage\nPort Jacobville, PR 3...  MediumAquaMarine   

   Avg. Session Length  Time on App  Time on Website  Length of Membership  \
0            34.497268    12.655651        39.577668              4.082621   
1            31.926272    11.109461        37.268959              2.664034   
2            33.000915    11.330278        37.110597              4.10454