In [3]:
import numpy as np
import pandas as pd
import warnings
import plotly.express as px
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter can also hide scikit-learn fit-failure and
# convergence warnings raised by the model searches further down.
warnings.filterwarnings("ignore")
# Render floats with five decimal places in pandas output.
pd.set_option("display.float_format","{:.5f}".format)

In [4]:
# Load the e-commerce shipping data from a CSV file in the working directory.
# Source: https://www.kaggle.com/datasets/prachi13/customer-analytics
df=pd.read_csv("e_commerce.csv")

In [5]:
# Show the column labels together with the (rows, columns) shape of the frame.
df.columns,df.shape

(Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
        'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
        'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
        'Reached.on.Time_Y.N'],
       dtype='object'),
 (10999, 12))

In [6]:
# Count the missing values in each column.
df.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64

There are no missing values

# Exploratory data analysis

In [7]:
# Mean of the 0/1 target flag, i.e. the share of rows whose flag equals 1.
df["Reached.on.Time_Y.N"].mean()

0.5966906082371125

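The target variable is fairly balanced: about 60% of the rows have the value 1 and about 40% the value 0. (Check the label encoding against the dataset description: the Kaggle page documents 1 as "the product has NOT reached on time", which would invert the "on time" wording used in the rest of this notebook.)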

In [8]:
# Bar chart of the target flag by warehouse block, coloured by block.
# The un-aggregated rows are passed straight to plotly (no groupby beforehand).
px.bar(df,y="Reached.on.Time_Y.N",x="Warehouse_block",color="Warehouse_block")


Most products that were delivered on time came from warehouse block F. This is because most items overall came from warehouse block F.

In [9]:
# Number of rows per warehouse block.
df["Warehouse_block"].value_counts()

Warehouse_block
F    3666
D    1834
A    1833
B    1833
C    1833
Name: count, dtype: int64

In [10]:
# Mean of the target flag (share of 1s) within each warehouse block.
df.groupby("Warehouse_block")["Reached.on.Time_Y.N"].mean()

Warehouse_block
A   0.58647
B   0.60229
C   0.59684
D   0.59760
F   0.59847
Name: Reached.on.Time_Y.N, dtype: float64

There is no significant difference between the warehouse blocks in the share of products that arrived on time.

In [11]:
# Bar chart of the target flag by mode of shipment, coloured by mode.
# The un-aggregated rows are passed straight to plotly (no groupby beforehand).
px.bar(df,y="Reached.on.Time_Y.N",x="Mode_of_Shipment",color="Mode_of_Shipment")

In [12]:
# Observation from the chart above: most shipments are transferred by ship.
# Mean of the target flag (share of 1s) within each mode of shipment.
df.groupby("Mode_of_Shipment")["Reached.on.Time_Y.N"].mean()

Mode_of_Shipment
Flight   0.60158
Road     0.58807
Ship     0.59756
Name: Reached.on.Time_Y.N, dtype: float64

In [13]:
# the mode of shipment seems to have no effect on whether the product arrived on time

In [14]:
# Mean of the Weight_in_gms column for each value of the target flag.
df.groupby("Reached.on.Time_Y.N")["Weight_in_gms"].mean()

Reached.on.Time_Y.N
0   4168.66839
1   3272.64010
Name: Weight_in_gms, dtype: float64

In [15]:
# Observation from the weight means in the previous cell: products that arrived
# on time tended to be lighter.
# Box plot of the product cost, one box per value of the target flag.
px.box(df,x="Cost_of_the_Product",color="Reached.on.Time_Y.N")

Most of the products that arrived on time had a lower cost than those that arrived late

# Supervised learning


First, we will scale the numeric independent variables.

In [42]:
from sklearn.preprocessing import StandardScaler

In [53]:
# Target vector: the 0/1 "Reached.on.Time_Y.N" column.
y=df["Reached.on.Time_Y.N"]

In [54]:
# Numeric predictors only: remove the four categorical columns (they are one-hot
# encoded separately), the row ID and the target column.
non_numeric_columns=[
    "Warehouse_block","Mode_of_Shipment","Product_importance",
    "Gender","ID","Reached.on.Time_Y.N",
]
X_numericals=df.drop(columns=non_numeric_columns)

In [55]:
# Standardise every numeric predictor; the result is a NumPy array, so the column
# labels are lost at this point.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that is
# made later in the notebook, so test-set statistics leak into the training features.
# Consider splitting first and fitting the scaler on the training rows only.
scaler=StandardScaler()
X_numericals=scaler.fit_transform(X_numericals)

In [56]:
# Wrap the scaled array in a DataFrame again and restore the numeric column labels
# (every column of df except the categorical ones, the ID and the target).
numeric_labels=df.columns.drop(
    ["Warehouse_block","Mode_of_Shipment","Product_importance","Gender","ID","Reached.on.Time_Y.N"]
)
X_numericals=pd.DataFrame(X_numericals,columns=numeric_labels)

In [57]:
# One-hot encode the four categorical predictors (one indicator column per level).
categorical_columns=["Warehouse_block","Mode_of_Shipment","Product_importance","Gender"]
X_dummies=pd.get_dummies(df[categorical_columns])

In [58]:
# Full design matrix: scaled numeric block and dummy block joined side by side
# (pandas aligns the two frames on their row index).
X=pd.concat((X_numericals,X_dummies),axis="columns")

In [59]:
# List the final feature columns: the scaled numericals followed by the dummy columns.
X.columns

Index(['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms',
       'Warehouse_block_A', 'Warehouse_block_B', 'Warehouse_block_C',
       'Warehouse_block_D', 'Warehouse_block_F', 'Mode_of_Shipment_Flight',
       'Mode_of_Shipment_Road', 'Mode_of_Shipment_Ship',
       'Product_importance_high', 'Product_importance_low',
       'Product_importance_medium', 'Gender_F', 'Gender_M'],
      dtype='object')

# SVM classification
We will first train a simple SVM with default parameters. Next, we will tune it with a randomized search.

In [70]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score,classification_report

In [67]:
# Hold out a test set with a fixed seed. No test_size is passed, so scikit-learn's
# default split proportion applies.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1)

In [68]:
# Baseline SVM classifier with scikit-learn's default hyper-parameters.
model=SVC()

In [73]:
# Fit the baseline SVM on the training split, predict the held-out rows and
# display the plain accuracy.
y_pred=model.fit(X_train,y_train).predict(X_test)
accuracy_score(y_true=y_test,y_pred=y_pred)

0.6632727272727272

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

We will use a randomized search instead of an exhaustive grid search to reduce computing time.

In [None]:
# Search space for the randomized SVM search.
# scikit-learn names the polynomial kernel "poly"; "polynomial" is rejected by SVC,
# so every candidate that sampled it failed to fit (scored NaN) and `degree` was
# never actually explored.
param_dist={
    "kernel":["linear","rbf","poly"],
    "C":uniform(0.1,10),        # scipy uniform(loc, scale): C drawn from [0.1, 10.1]
    "gamma":uniform(0.01,0.1),  # gamma drawn from [0.01, 0.11]; ignored by the linear kernel
    "degree":[2,3]              # only used by the "poly" kernel
}

In [None]:
# Randomized search for the SVM: 50 sampled candidates, scored on accuracy.
# Only 2 cross-validation folds are used to keep the processing time low.
search_options=dict(
    n_iter=50,
    cv=2,
    scoring="accuracy",
    random_state=1,
    n_jobs=-1,
    verbose=2,
)
random_search = RandomizedSearchCV(SVC(), param_dist, **search_options)

In [115]:
# Run the randomized SVM search on the training split.
random_search.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


In [116]:
# Hyper-parameters of the best-scoring SVM candidate.
random_search.best_params_

{'C': 8.024035857960191, 'gamma': 0.012980135818239327, 'kernel': 'rbf'}

In [117]:
# Predict the held-out rows with the best SVM found by the search.
y_pred=random_search.best_estimator_.predict(X_test)

In [118]:
# Test-set accuracy of the tuned SVM.
accuracy_score(y_test,y_pred)

0.6614545454545454

In [119]:
# Per-class precision, recall and F1 of the tuned SVM on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.83      0.66      1100
           1       0.83      0.55      0.66      1650

    accuracy                           0.66      2750
   macro avg       0.69      0.69      0.66      2750
weighted avg       0.72      0.66      0.66      2750



The accuracy of the model is low (66%), but its precision for class 1 is good (0.83). This means that the current model minimizes the shipments wrongly predicted as "arriving on time" (false positives), at the cost of a low recall for that class (0.55).

# Logistic regression

In [121]:
from sklearn.linear_model import LogisticRegression

In [123]:
 param_dist = {
    'C': uniform(loc=0.01, scale=10),               
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': uniform(0, 1)                       
}

In [126]:
# Randomized search for the logistic regression: 50 sampled candidates, 5-fold CV.
# Both the search and the estimator are seeded (saga is a stochastic solver), so
# the sampled candidates and the fitted model are reproducible. Scoring and n_jobs
# are spelled out to match the SVM and random-forest searches; accuracy is also
# what a classifier's default score reports.
# NOTE(review): warnings are silenced at the top of the notebook, so a saga
# ConvergenceWarning would go unnoticed; consider raising max_iter if it occurs.
random_search=RandomizedSearchCV(
    estimator=LogisticRegression(random_state=1),
    param_distributions=param_dist,
    cv=5,
    n_iter=50,
    scoring="accuracy",
    n_jobs=-1,
    random_state=1
)

In [128]:
# Run the randomized logistic-regression search on the training split.
random_search.fit(X_train,y_train)

In [129]:
# Hyper-parameters of the best-scoring logistic-regression candidate.
random_search.best_params_

{'C': 1.043982167432772,
 'l1_ratio': 0.8581182696655945,
 'penalty': 'l1',
 'solver': 'saga'}

In [130]:
# Keep the best logistic regression found by the search.
# Note: this rebinds `model`, which previously held the baseline SVC.
model=random_search.best_estimator_

In [131]:
# Predict the held-out rows with the tuned logistic regression.
y_pred=model.predict(X_test)

In [132]:
# Test-set accuracy of the tuned logistic regression.
accuracy_score(y_test,y_pred)

0.6345454545454545

In [133]:
# Per-class precision, recall and F1 of the tuned logistic regression on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.54      0.58      0.56      1100
           1       0.70      0.67      0.69      1650

    accuracy                           0.63      2750
   macro avg       0.62      0.62      0.62      2750
weighted avg       0.64      0.63      0.64      2750



The accuracy is still low (63%).

# KNN classifier

In [None]:
# The original statement ended after `import`, which is a SyntaxError; name the
# class that the KNN section actually uses.
from sklearn.neighbors import KNeighborsClassifier

In [134]:
from sklearn.neighbors import KNeighborsClassifier

In [135]:
# Search space for the KNN search: odd neighbourhood sizes from 1 to 49, two
# weighting schemes and two distance metrics.
param_dist=dict(
    n_neighbors=np.arange(start=1, stop=50, step=2),
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

In [140]:
# Randomized search for KNN: 50 sampled candidates, 5-fold CV, scored on accuracy.
# The search is seeded so the sampled candidates (and therefore the reported best
# parameters) are reproducible, and it runs in parallel like the SVM and
# random-forest searches.
random_search = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=1,
)

In [141]:
# Run the randomized KNN search on the training split.
random_search.fit(X_train, y_train)

In [142]:
# Hyper-parameters of the best-scoring KNN candidate.
random_search.best_params_

{'weights': 'distance', 'n_neighbors': 49, 'metric': 'manhattan'}

In [143]:
# Predict the held-out rows with the best KNN model found by the search.
y_pred=random_search.best_estimator_.predict(X_test)  

In [144]:
# Test-set accuracy of the tuned KNN model.
accuracy_score(y_test,y_pred)

0.6516363636363637

In [145]:
# Per-class precision, recall and F1 of the tuned KNN model on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.71      0.62      1100
           1       0.76      0.61      0.68      1650

    accuracy                           0.65      2750
   macro avg       0.65      0.66      0.65      2750
weighted avg       0.68      0.65      0.66      2750



# Random Forest

In [150]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest.
param_dist = {
    'n_estimators': np.arange(50, 200, 10),    # 50, 60, ..., 190 trees
    'max_depth': np.arange(5, 20, 2),          # 5, 7, ..., 19
    'min_samples_split': np.arange(2, 10, 1),  # 2 .. 9
    'min_samples_leaf': np.arange(1, 5, 1),    # 1 .. 4
    'bootstrap': [True, False]
}

# Randomized search: 50 sampled candidates, 5-fold CV, scored on accuracy.
# The forest itself is seeded as well as the search; otherwise every fit draws
# different bootstrap samples and feature subsets, so the selected parameters,
# the test scores and the feature importances change from run to run.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    random_state=1
)

In [152]:
# Run the randomized random-forest search on the training split.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [154]:
# Hyper-parameters of the best-scoring random-forest candidate.
random_search.best_params_

{'n_estimators': 130,
 'min_samples_split': 2,
 'min_samples_leaf': 3,
 'max_depth': 7,
 'bootstrap': True}

In [155]:
# Predict the held-out rows with the best random forest and display its accuracy.
y_pred=random_search.best_estimator_.predict(X_test)
accuracy_score(y_test,y_pred)

0.6792727272727273

In [157]:
# Per-class precision, recall and F1 of the tuned random forest on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.56      0.90      0.69      1100
           1       0.89      0.53      0.66      1650

    accuracy                           0.68      2750
   macro avg       0.73      0.72      0.68      2750
weighted avg       0.76      0.68      0.68      2750



# Neural network classifier

In [164]:
from tensorflow import keras
from keras import layers

In [166]:
# Check the training matrix shape; the column count is the network's input width.
X_train.shape

(8249, 19)

In [172]:
# Small fully connected binary classifier: input -> 16 -> 8 -> 1 (sigmoid).
# The input width is read from the training matrix instead of hard-coding 19, so the
# model keeps working if the feature set changes. It is declared with keras.Input
# because passing `input_shape` to a layer is discouraged in current Keras.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid')  #  binary classification
])

In [173]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [174]:
# Train for 10 epochs with mini-batches of 8 rows. No validation data is passed,
# so the accuracy logged per epoch is measured on the training data.
model.fit(X_train, y_train, epochs=10, batch_size=8)

Epoch 1/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.5905 - loss: 0.6068
Epoch 2/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6691 - loss: 0.5272
Epoch 3/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6680 - loss: 0.5215
Epoch 4/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6806 - loss: 0.5184
Epoch 5/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6779 - loss: 0.5188
Epoch 6/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6749 - loss: 0.5162
Epoch 7/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6755 - loss: 0.5160
Epoch 8/10
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6805 - loss: 0.5110
Epoch 9/10
[1m1032/1032

<keras.src.callbacks.history.History at 0x2a74a7f49b0>

In [176]:
# Network outputs for the held-out rows. The last layer is a sigmoid, so these are
# values between 0 and 1 rather than hard class labels.
# NOTE(review): y_pred is not used again in the cells that follow.
y_pred=model.predict(X_test)

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [178]:
# Evaluate the network on the held-out rows; evaluate() returns the loss followed
# by the compiled metric (accuracy). Print both.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6536 - loss: 0.5058
Test loss: 0.5175766944885254
Test accuracy: 0.6578181982040405


Accuracy is still low

The best model in terms of precision is the random forest (0.89 for class 1), so we will look at its feature importances.

In [180]:
# Impurity-based feature importances of the tuned random forest.
# NOTE(review): `random_search` is rebound for every model in this notebook; this
# cell only works while it still holds the random-forest search (the neural-network
# cells in between do not touch it). A dedicated name would remove that hidden
# dependency on execution order.
best_model=random_search.best_estimator_
importances = best_model.feature_importances_
print(importances)

[0.02395016 0.0099352  0.06603899 0.04649975 0.4736793  0.34255977
 0.00277529 0.00251673 0.0023201  0.00298818 0.00361862 0.00299822
 0.00245695 0.00292736 0.00373914 0.00311162 0.00242779 0.00299259
 0.00246425]


In [184]:
# Pair every feature name with its importance and rank from most to least important.
importance_table = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importance_df = importance_table.sort_values('importance', ascending=False)

print(feature_importance_df)

                      feature  importance
4            Discount_offered     0.47368
5               Weight_in_gms     0.34256
2         Cost_of_the_Product     0.06604
3             Prior_purchases     0.04650
0         Customer_care_calls     0.02395
1             Customer_rating     0.00994
14    Product_importance_high     0.00374
10          Warehouse_block_F     0.00362
15     Product_importance_low     0.00311
11    Mode_of_Shipment_Flight     0.00300
17                   Gender_F     0.00299
9           Warehouse_block_D     0.00299
13      Mode_of_Shipment_Ship     0.00293
6           Warehouse_block_A     0.00278
7           Warehouse_block_B     0.00252
18                   Gender_M     0.00246
12      Mode_of_Shipment_Road     0.00246
16  Product_importance_medium     0.00243
8           Warehouse_block_C     0.00232
