In [1]:
# Importing all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning

In [2]:
# Load the Data
df = pd.read_csv("../data/raw/ecommerce_data.csv")
df.head()

Unnamed: 0,Tenure,WarehouseToHome,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,DaySinceLastOrder,CashbackAmount,Churn
0,15.0,29.0,4,Laptop & Accessory,3,Single,2,0,7.0,143.32,0
1,7.0,25.0,4,Mobile,1,Married,2,0,7.0,129.29,0
2,27.0,13.0,3,Laptop & Accessory,1,Married,5,0,7.0,168.54,0
3,20.0,25.0,4,Fashion,3,Divorced,7,0,,230.27,0
4,30.0,15.0,4,Others,4,Single,8,0,8.0,322.17,0


In [3]:
# Null Values
print("Before Dropping : \n",df.isnull().sum())
df.dropna(inplace=True)

Before Dropping : 
 Tenure                      194
WarehouseToHome             169
NumberOfDeviceRegistered      0
PreferedOrderCat              0
SatisfactionScore             0
MaritalStatus                 0
NumberOfAddress               0
Complain                      0
DaySinceLastOrder           213
CashbackAmount                0
Churn                         0
dtype: int64


In [4]:
# Correcting Data Types
print("Before Changing: \n")
print(df.dtypes)

df = df.astype({
    'Tenure' : 'int',
    'WarehouseToHome' : 'int',
    'DaySinceLastOrder' : 'int'
})


print("\n\nAfter Changing: \n")
print(df.dtypes)

Before Changing: 

Tenure                      float64
WarehouseToHome             float64
NumberOfDeviceRegistered      int64
PreferedOrderCat             object
SatisfactionScore             int64
MaritalStatus                object
NumberOfAddress               int64
Complain                      int64
DaySinceLastOrder           float64
CashbackAmount              float64
Churn                         int64
dtype: object


After Changing: 

Tenure                        int64
WarehouseToHome               int64
NumberOfDeviceRegistered      int64
PreferedOrderCat             object
SatisfactionScore             int64
MaritalStatus                object
NumberOfAddress               int64
Complain                      int64
DaySinceLastOrder             int64
CashbackAmount              float64
Churn                         int64
dtype: object


## Feature Engineering

In [5]:
df['IsNewCustomer'] = df['Tenure'] <= 5

df['LowSatisfaction'] = df['SatisfactionScore'] <= 2

df['HighCashback'] = df['CashbackAmount'] > df['CashbackAmount'].median()

df['RecentlyActive'] = df['DaySinceLastOrder'] <= 3

for col in ['IsNewCustomer','LowSatisfaction','HighCashback','RecentlyActive']:
    df[col] = df[col].astype(int)

In [6]:
df.head()
print(df['PreferedOrderCat'].unique())

df['PreferedOrderCat'] = df['PreferedOrderCat'].replace('Mobile Phone','Mobile')

print(df['PreferedOrderCat'].unique())

['Laptop & Accessory' 'Mobile' 'Others' 'Mobile Phone' 'Fashion' 'Grocery']
['Laptop & Accessory' 'Mobile' 'Others' 'Fashion' 'Grocery']


In [7]:
# Remove Duplicates
print(df.duplicated().sum())

df.drop_duplicates(inplace=True)

print(df.duplicated().sum())


571
0


In [12]:
#Save Cleaned Dataset
df.to_csv("../data/processed/df_cleaned.csv",index=False)

In [8]:
# Encoding Categorical Features
print(df['MaritalStatus'].unique())

print(df['PreferedOrderCat'].unique())

cat_cols = ['MaritalStatus','PreferedOrderCat']

df_logistic = pd.get_dummies(df, columns=cat_cols, drop_first=True)

df_tree = pd.get_dummies(df, columns=cat_cols, drop_first=False)

['Single' 'Married' 'Divorced']
['Laptop & Accessory' 'Mobile' 'Others' 'Fashion' 'Grocery']


In [13]:
# Save Individual Dataset after encoding
df_logistic.to_csv("../data/processed/df_logistic.csv",index=False)
df_tree.to_csv("../data/processed/df_tree.csv",index=False)