In [52]:
# Importing necessary libraries
# Data analysis and preprocessing
import pandas as pd 
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import mplfinance as mpf
from scipy import stats

# ML models
from sklearn.linear_model import LinearRegression

# Validation of models with metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error

In [53]:
df = pd.read_excel('E-Commerce-Dataset.xlsx', 1)
df

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
1,50002,1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,120.90
2,50003,1,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120.28
3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
4,50005,1,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,129.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5625,55626,0,10.0,Computer,1,30.0,Credit Card,Male,3.0,2,Laptop & Accessory,1,Married,6,0,18.0,1.0,2.0,4.0,150.71
5626,55627,0,13.0,Mobile Phone,1,13.0,Credit Card,Male,3.0,5,Fashion,5,Married,6,0,16.0,1.0,2.0,,224.91
5627,55628,0,1.0,Mobile Phone,1,11.0,Debit Card,Male,3.0,2,Laptop & Accessory,4,Married,3,1,21.0,1.0,2.0,4.0,186.42
5628,55629,0,23.0,Computer,3,9.0,Credit Card,Male,4.0,5,Laptop & Accessory,4,Married,4,0,15.0,2.0,2.0,9.0,178.90


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress   

In [55]:
df.duplicated().sum()

np.int64(0)

In [56]:
df.isna().sum()

CustomerID                       0
Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64

In [57]:
# Implementation of KNN Imputer
from sklearn.impute import KNNImputer
imp_knn = KNNImputer(n_neighbors = 5)

num_columns = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder']

In [58]:
df[num_columns] = imp_knn.fit_transform(df[num_columns])
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
1,50002,1,9.8,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,120.9
2,50003,1,8.0,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120.28
3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
4,50005,1,0.0,Phone,1,12.0,CC,Male,2.4,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,129.6


In [59]:
df.isna().sum()

CustomerID                     0
Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64

In [60]:
df['PreferredLoginDevice'].value_counts()

PreferredLoginDevice
Mobile Phone    2765
Computer        1634
Phone           1231
Name: count, dtype: int64

In [61]:
df['PreferredLoginDevice'] = [1 if i == 'Computer' else 0 for i in df['PreferredLoginDevice']]

In [62]:
df['PreferredLoginDevice'].value_counts()

PreferredLoginDevice
0    3996
1    1634
Name: count, dtype: int64

In [63]:
df['PreferredPaymentMode'].value_counts()

PreferredPaymentMode
Debit Card          2314
Credit Card         1501
E wallet             614
UPI                  414
COD                  365
CC                   273
Cash on Delivery     149
Name: count, dtype: int64

In [64]:
df = pd.concat([df, pd.get_dummies(data=df['PreferredPaymentMode'], prefix='PreferredPaymentMode', drop_first=True)], axis=1)
df.drop(columns=['PreferredPaymentMode'], inplace=True)

In [65]:
df['Gender'].value_counts()

Gender
Male      3384
Female    2246
Name: count, dtype: int64

In [66]:
df['Gender'] = [1 if i == 'Male' else 0 for i in df['Gender']]

In [67]:
df['PreferedOrderCat'].value_counts()

PreferedOrderCat
Laptop & Accessory    2050
Mobile Phone          1271
Fashion                826
Mobile                 809
Grocery                410
Others                 264
Name: count, dtype: int64

In [68]:
df = pd.concat([df, pd.get_dummies(data=df['PreferedOrderCat'], prefix='PreferedOrderCat', drop_first=True)], axis=1)
df.drop(columns=['PreferedOrderCat'], inplace=True)

In [69]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married     2986
Single      1796
Divorced     848
Name: count, dtype: int64

In [70]:
df = pd.concat([df, pd.get_dummies(data=df['MaritalStatus'], prefix='MaritalStatus', drop_first=True)], axis=1)
df.drop(columns=['MaritalStatus'], inplace=True)

In [71]:
df

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,Gender,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,...,PreferredPaymentMode_Debit Card,PreferredPaymentMode_E wallet,PreferredPaymentMode_UPI,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Married,MaritalStatus_Single
0,50001,1,4.0,0,3,6.0,0,3.0,3,2,...,True,False,False,False,True,False,False,False,False,True
1,50002,1,9.8,0,1,8.0,1,3.0,4,3,...,False,False,True,False,False,True,False,False,False,True
2,50003,1,8.0,0,1,30.0,1,2.0,4,3,...,True,False,False,False,False,True,False,False,False,True
3,50004,1,0.0,0,3,15.0,1,2.0,4,5,...,True,False,False,False,True,False,False,False,False,True
4,50005,1,0.0,0,1,12.0,1,2.4,3,5,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5625,55626,0,10.0,1,1,30.0,1,3.0,2,1,...,False,False,False,False,True,False,False,False,True,False
5626,55627,0,13.0,0,1,13.0,1,3.0,5,5,...,False,False,False,False,False,False,False,False,True,False
5627,55628,0,1.0,0,1,11.0,1,3.0,2,4,...,True,False,False,False,True,False,False,False,True,False
5628,55629,0,23.0,1,3,9.0,1,4.0,5,4,...,False,False,False,False,True,False,False,False,True,False


In [72]:
df.corr()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,Gender,HourSpendOnApp,NumberOfDeviceRegistered,SatisfactionScore,...,PreferredPaymentMode_Debit Card,PreferredPaymentMode_E wallet,PreferredPaymentMode_UPI,PreferedOrderCat_Grocery,PreferedOrderCat_Laptop & Accessory,PreferedOrderCat_Mobile,PreferedOrderCat_Mobile Phone,PreferedOrderCat_Others,MaritalStatus_Married,MaritalStatus_Single
CustomerID,1.0,-0.019083,0.038909,0.002508,0.003239,0.055734,0.004251,0.591668,0.411098,-0.033146,...,0.002147,-0.006279,0.009629,0.002085,0.009755,-0.354631,0.294849,-0.008253,0.180235,-0.008068
Churn,-0.019083,1.0,-0.338851,0.051099,0.084703,0.072672,0.029264,0.012162,0.107939,0.105481,...,-0.032453,0.055751,0.004163,-0.089575,-0.133353,0.113364,0.154387,-0.054903,-0.151024,0.180847
Tenure,0.038909,-0.338851,1.0,-0.043297,-0.050778,-0.021438,-0.045787,0.001676,-0.019119,-0.013007,...,0.017514,0.012687,-0.03841,0.347337,-0.047829,-0.175262,-0.234986,0.264163,0.081921,-0.118238
PreferredLoginDevice,0.002508,0.051099,-0.043297,1.0,-0.002476,0.025483,0.015871,-0.017105,0.021096,-0.036049,...,0.030548,-0.037917,-0.003232,-0.010534,0.04638,-0.014277,0.004789,-0.045577,0.002643,0.019098
CityTier,0.003239,0.084703,-0.050778,-0.002476,1.0,0.009708,-0.025176,-0.008429,0.027934,-0.011554,...,-0.118713,0.514227,-0.030518,0.00416,0.226587,-0.127602,-0.188046,-0.042998,-0.029922,0.020878
WarehouseToHome,0.055734,0.072672,-0.021438,0.025483,0.009708,1.0,-0.0045,0.058778,0.020264,0.007258,...,-0.021765,0.033142,-0.006961,0.030555,0.039513,-0.031198,-0.022667,-0.076716,0.025759,-0.021693
Gender,0.004251,0.029264,-0.045787,0.015871,-0.025176,-0.0045,1.0,-0.018963,-0.021799,-0.03522,...,0.00231,-0.0245,0.043301,-0.048071,-0.019739,0.017308,0.042552,0.033147,0.037225,-0.032308
HourSpendOnApp,0.591668,0.012162,0.001676,-0.017105,-0.008429,0.058778,-0.018963,1.0,0.311181,0.03469,...,0.020709,0.004164,-0.027247,-0.038603,-0.021929,-0.223542,0.246317,-0.019943,0.034111,-0.046512
NumberOfDeviceRegistered,0.411098,0.107939,-0.019119,0.021096,0.027934,0.020264,-0.021799,0.311181,1.0,-0.017228,...,-0.007516,0.025578,0.007815,-0.035706,-0.006282,-0.215316,0.168584,0.021423,-0.036259,0.043389
SatisfactionScore,-0.033146,0.105481,-0.013007,-0.036049,-0.011554,0.007258,-0.03522,0.03469,-0.017228,1.0,...,-0.050365,0.017752,0.008063,0.002287,-0.019501,0.005493,0.005577,0.001442,-0.023576,-0.019314


In [73]:
X = df.drop(columns=['Churn'])
y = df['Churn']

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.73174319, -0.72111489, -0.63945986, ..., -0.22180771,
        -1.06270855,  1.46107627],
       [-1.73112789, -0.0332012 , -0.63945986, ..., -0.22180771,
        -1.06270855,  1.46107627],
       [-1.7305126 , -0.24669165, -0.63945986, ..., -0.22180771,
        -1.06270855,  1.46107627],
       ...,
       [ 1.7305126 , -1.07693231, -0.63945986, ..., -0.22180771,
         0.94099177, -0.68442697],
       [ 1.73112789,  1.53239547,  1.56381982, ..., -0.22180771,
         0.94099177, -0.68442697],
       [ 1.73174319, -0.24669165, -0.63945986, ..., -0.22180771,
         0.94099177, -0.68442697]], shape=(5630, 29))

In [75]:
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=43)

In [80]:
from sklearn.tree import DecisionTreeClassifier

dc = DecisionTreeClassifier()
dc.fit(X = X_train, y = y_train)
y_train_pred = dc.predict(X = X_train)
y_test_pred = dc.predict(X = X_test)

In [None]:
pd.Series(y_test_pred)

0       0
1       0
2       0
3       0
4       0
       ..
1121    0
1122    0
1123    0
1124    0
1125    0
Length: 1126, dtype: int64

In [85]:
y_test

2299    0
1243    0
2345    0
1904    0
1835    0
       ..
2503    0
3514    0
4954    0
4047    0
2242    0
Name: Churn, Length: 1126, dtype: int64

In [89]:
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

roc_auc_score(y_true = y_test, y_score = y_test_pred)
accuracy_score(y_true = y_test, y_pred = y_test_pred)

0.9484902309058615

In [78]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X = X_train, y = y_train)
y_train_pred = lr.predict(X = X_train)
y_test_pred = lr.predict(X = X_test)

In [79]:
print(f"MSE error in training data : {mean_squared_error(y_true = y_train, y_pred = y_train_pred)}")
print(f"MAE error in training data : {mean_absolute_error(y_true = y_train, y_pred = y_train_pred)}")
print(f"MSL error in training data : {mean_squared_log_error(y_true = y_train, y_pred = y_train_pred)}")

print(f"MSE error in test data : {mean_squared_error(y_true = y_test, y_pred = y_test_pred)}")
print(f"MAE error in test data : {mean_absolute_error(y_true = y_test, y_pred = y_test_pred)}")
print(f"MSL error in test data : {mean_squared_log_error(y_true = y_test, y_pred = y_test_pred)}")

MSE error in training data : 0.10031814230795633
MAE error in training data : 0.24148723403795894
MSL error in training data : 0.05554051418763844
MSE error in test data : 0.09642667712208224
MAE error in test data : 0.23860343633487788
MSL error in test data : 0.05376177117930589
