In [2]:
# Importing all required libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Data Cleaning

In [3]:
# Read the raw e-commerce CSV into `df` and preview its first rows.
raw_data_path = "../data/raw/ecommerce_data.csv"
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,Tenure,WarehouseToHome,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,DaySinceLastOrder,CashbackAmount,Churn
0,15.0,29.0,4,Laptop & Accessory,3,Single,2,0,7.0,143.32,0
1,7.0,25.0,4,Mobile,1,Married,2,0,7.0,129.29,0
2,27.0,13.0,3,Laptop & Accessory,1,Married,5,0,7.0,168.54,0
3,20.0,25.0,4,Fashion,3,Divorced,7,0,,230.27,0
4,30.0,15.0,4,Others,4,Single,8,0,8.0,322.17,0


In [4]:
# Null Values
# Report the number of missing values per column, then drop every row that
# contains at least one missing value.
null_counts = df.isnull().sum()
print("Before Dropping : \n",null_counts)

# Reassign rather than using inplace=True: the cell then produces a new frame
# instead of mutating the loaded one in place.
df = df.dropna()

Before Dropping : 
 Tenure                      194
WarehouseToHome             169
NumberOfDeviceRegistered      0
PreferedOrderCat              0
SatisfactionScore             0
MaritalStatus                 0
NumberOfAddress               0
Complain                      0
DaySinceLastOrder           213
CashbackAmount                0
Churn                         0
dtype: int64


In [5]:
# Correcting Data Types
# Cast the three float columns listed below to integers, printing the column
# dtypes before and after the conversion.
int_columns = ['Tenure', 'WarehouseToHome', 'DaySinceLastOrder']

print("Before Changing: \n")
print(df.dtypes)

df = df.astype(dict.fromkeys(int_columns, 'int'))

print("\n\nAfter Changing: \n")
print(df.dtypes)

Before Changing: 

Tenure                      float64
WarehouseToHome             float64
NumberOfDeviceRegistered      int64
PreferedOrderCat             object
SatisfactionScore             int64
MaritalStatus                object
NumberOfAddress               int64
Complain                      int64
DaySinceLastOrder           float64
CashbackAmount              float64
Churn                         int64
dtype: object


After Changing: 

Tenure                        int64
WarehouseToHome               int64
NumberOfDeviceRegistered      int64
PreferedOrderCat             object
SatisfactionScore             int64
MaritalStatus                object
NumberOfAddress               int64
Complain                      int64
DaySinceLastOrder             int64
CashbackAmount              float64
Churn                         int64
dtype: object


## Feature Engineering

In [6]:
# Derive four 0/1 indicator columns from existing numeric columns.
# NOTE(review): the cashback median is taken over the whole frame, i.e. before
# duplicates are removed and before the train/test split — confirm this is intended.
cashback_median = df['CashbackAmount'].median()

engineered_flags = {
    'IsNewCustomer': df['Tenure'] <= 5,
    'LowSatisfaction': df['SatisfactionScore'] <= 2,
    'HighCashback': df['CashbackAmount'] > cashback_median,
    'RecentlyActive': df['DaySinceLastOrder'] <= 3,
}

# Store each boolean mask as an integer column, in the order listed above.
for flag_name, flag_mask in engineered_flags.items():
    df[flag_name] = flag_mask.astype(int)

In [7]:
# Show the distinct order categories, collapse the 'Mobile Phone' label into
# 'Mobile', then show the categories again to confirm the merge.
# (A bare `df.head()` used to open this cell; it was removed because only the
# last expression of a cell is displayed, so it had no effect.)
print(df['PreferedOrderCat'].unique())

df['PreferedOrderCat'] = df['PreferedOrderCat'].replace('Mobile Phone','Mobile')

print(df['PreferedOrderCat'].unique())

['Laptop & Accessory' 'Mobile' 'Others' 'Mobile Phone' 'Fashion' 'Grocery']
['Laptop & Accessory' 'Mobile' 'Others' 'Fashion' 'Grocery']


In [8]:
# Remove Duplicates
# Print the number of fully duplicated rows before and after removing them.
print(df.duplicated().sum())

# Reassign rather than using inplace=True so the cell yields a new frame
# instead of mutating the existing one; the first occurrence of each
# duplicated row is kept (pandas default keep='first').
df = df.drop_duplicates()

print(df.duplicated().sum())


571
0


In [9]:
# Save Cleaned Dataset
# Write the cleaned frame to the processed-data folder without the row index.
cleaned_csv_path = "../data/processed/df_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

In [10]:
# Encoding Categorical Features
# One-hot encode the two categorical columns twice: with the first level
# dropped (df_logistic) and with every level kept (df_tree).
cat_cols = ['MaritalStatus','PreferedOrderCat']

# Show the distinct levels of each categorical column before encoding.
for cat_col in cat_cols:
    print(df[cat_col].unique())

df_logistic = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df_tree = pd.get_dummies(df, columns=cat_cols, drop_first=False)

['Single' 'Married' 'Divorced']
['Laptop & Accessory' 'Mobile' 'Others' 'Fashion' 'Grocery']


In [11]:
# Save Individual Dataset after encoding
# Each encoded frame is written to its own CSV, without the row index.
encoded_outputs = {
    "../data/processed/df_logistic.csv": df_logistic,
    "../data/processed/df_tree.csv": df_tree,
}
for encoded_path, encoded_frame in encoded_outputs.items():
    encoded_frame.to_csv(encoded_path, index=False)

# Modelling

In [12]:
# Train test split
# Both encoded frames get the same 80/20 split with the same seed; 'Churn' is
# the target and every other column is a feature.
# NOTE(review): the split is not stratified on the target even though the
# recorded test support is 478 vs 81. Passing stratify=<target> would keep the
# class ratio equal in both halves, but it would change every metric recorded
# in this notebook — confirm before changing.
split_options = {'test_size': 0.2, 'random_state': 42}

x_log = df_logistic.drop(columns='Churn')
y_log = df_logistic['Churn']
x_log_train, x_log_test, y_log_train, y_log_test = train_test_split(
    x_log, y_log, **split_options
)

x_tree = df_tree.drop(columns='Churn')
y_tree = df_tree['Churn']
x_tree_train, x_tree_test, y_tree_train, y_tree_test = train_test_split(
    x_tree, y_tree, **split_options
)

In [13]:
# Logistic Model
# Fit on the logistic-regression training split, predict the matching test
# split and print the classification report, confusion matrix and accuracy.
log_model = LogisticRegression(max_iter=1000).fit(x_log_train, y_log_train)
y_log_pred = log_model.predict(x_log_test)

log_report = classification_report(y_log_test, y_log_pred)
log_confusion = confusion_matrix(y_log_test, y_log_pred)
log_accuracy = accuracy_score(y_log_test, y_log_pred)

print("Logistic Regression Result : ")
print(log_report)
print('Confusion Matrix : \n',log_confusion)
print('Accuracy Score : ',log_accuracy)

Logistic Regression Result : 
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       478
           1       0.75      0.51      0.60        81

    accuracy                           0.90       559
   macro avg       0.83      0.74      0.77       559
weighted avg       0.90      0.90      0.90       559

Confusion Matrix : 
 [[464  14]
 [ 40  41]]
Accuracy Score :  0.9033989266547406


In [14]:
# Random Forest Model
# Fit a seeded random forest on the tree training split, predict the matching
# test split and print the classification report, confusion matrix and accuracy.
rf_model = RandomForestClassifier(random_state=42).fit(x_tree_train, y_tree_train)
y_rf_pred = rf_model.predict(x_tree_test)

rf_report = classification_report(y_tree_test, y_rf_pred)
rf_confusion = confusion_matrix(y_tree_test, y_rf_pred)
rf_accuracy = accuracy_score(y_tree_test, y_rf_pred)

print("Random Forest Result : ")
print(rf_report)
print('Confusion Matrix : \n',rf_confusion)
print('Accuracy Score : ',rf_accuracy)

Random Forest Result : 
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       478
           1       0.79      0.54      0.64        81

    accuracy                           0.91       559
   macro avg       0.86      0.76      0.80       559
weighted avg       0.91      0.91      0.91       559

Confusion Matrix : 
 [[466  12]
 [ 37  44]]
Accuracy Score :  0.9123434704830053


In [23]:
# Commented-out grid search over XGBoost hyper-parameters (3-fold CV, scored
# on F1). The best parameters it reported are recorded on the last line.
# from sklearn.model_selection import GridSearchCV

# params = {
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators':[100,200],
#     'min_child_weight':[1,3],
#     'scale_pos_weight':[5,7,10]
# }

# grid = GridSearchCV(
#     estimator= XGBClassifier(eval_metric='logloss', random_state=42),
#     param_grid=params,
#     scoring='f1',
#     cv=3
# )
# grid.fit(x_tree_train,y_tree_train)

# print("Best Parameters : ",grid.best_params_)
# Recorded output of the original run. That run used the misspelled key
# 'min_child_weigth' (corrected to 'min_child_weight' in the grid above), so
# min_child_weight itself was presumably never actually searched — TODO re-run
# the search to confirm the best value:
# {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weigth': 1, 'n_estimators': 200, 'scale_pos_weight': 5}

In [32]:
# XGBoost Model
# Train a seeded XGBoost classifier on the tree training split and evaluate it
# on the test split using a custom probability threshold.
xgb_params = {
    'learning_rate': 0.1,
    'eval_metric': 'logloss',
    'max_depth': 5,
    'min_child_weight': 1,
    'n_estimators': 200,
    'scale_pos_weight': 5,
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(x_tree_train, y_tree_train)

# Hard predictions from the model's own decision rule; not used in the report
# printed by this cell.
y_xgb_pred = xgb_model.predict(x_tree_test)

# Second predict_proba column, i.e. the probability of class 1.
y_probs = xgb_model.predict_proba(x_tree_test)[:, 1]

# NOTE(review): 0.6 is a hand-set cut-off. If it was chosen by inspecting
# test-set metrics, the scores printed here are optimistic — confirm.
threshold = 0.6
meets_threshold = y_probs >= threshold
y_custom_pred = meets_threshold.astype(int)

xgb_report = classification_report(y_tree_test, y_custom_pred)
xgb_confusion = confusion_matrix(y_tree_test, y_custom_pred)
xgb_accuracy = accuracy_score(y_tree_test, y_custom_pred)

print("XGBoost Model Result : ")
print(xgb_report)
print('Confusion Matrix : \n',xgb_confusion)
print('Accuracy Score : ',xgb_accuracy)


XGBoost Model Result : 
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       478
           1       0.72      0.72      0.72        81

    accuracy                           0.92       559
   macro avg       0.83      0.83      0.83       559
weighted avg       0.92      0.92      0.92       559

Confusion Matrix : 
 [[455  23]
 [ 23  58]]
Accuracy Score :  0.9177101967799642


In [40]:
# Saving the results
# One row per test sample: the true label, the thresholded XGBoost prediction
# and the predicted class-1 probability. Written without the row index.
# NOTE(review): the column name 'Probabilty' is misspelled. It is kept as-is
# because renaming it would change the CSV header for anything that reads this
# file — confirm with consumers before fixing.
prediction_columns = {
    'Actual': y_tree_test,
    'Predicted': y_custom_pred,
    'Probabilty': y_probs,
}
results_df = pd.DataFrame(prediction_columns)

results_path = '../results/xgb_predictions.csv'
results_df.to_csv(results_path, index=False)

In [43]:
# Save Model
# Serialise the fitted XGBoost classifier with joblib. joblib is imported with
# the other libraries in the import cell at the top of the notebook, not here.
# The trailing expression displays the list of file paths returned by dump.
joblib.dump(xgb_model,'../models/final_xgb_model.pkl')

['../models/final_xgb_model.pkl']