Dataset: E-Commerce Shipping Data from Kaggle: https://www.kaggle.com/prachi13/customer-analytics

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

ID: ID Number of Customers.

• Warehouse block: The Company has a big Warehouse which is divided into blocks A,B,C,D,E.

• Mode of shipment: The Company ships the products in multiple ways such as Ship, Flight and Road.

• Customer care calls: The number of calls made from enquiry for enquiry of the shipment.

• Customer rating: The company has rated from every customer. 1 is the lowest (Worst), 5 is the highest (Best).

• Cost of the product: Cost of the Product in US Dollars.

• Prior purchases: The Number of Prior Purchase.

• Product importance: The company has categorized the product in the various parameter such as low, medium, high.

• Gender: Male and Female.

• Discount offered: Discount offered on that specific product.

• Weight in gms: It is the weight in grams.

• Reached on time: It is the target variable, where 1 Indicates that the product has NOT reached on time and 0 
indicates it has reached on time.

# Wrangle Data

In [2]:
def wrangle(filepath):
    
    # Load data
    df = pd.read_csv(filepath, index_col="ID")
    
    # Switch binary values in target variable
    mask = df["Reached.on.Time_Y.N"] == 1
    df.loc[mask, "Reached_on_time"] = 0
    df.loc[~mask, "Reached_on_time"] = 1
    
    # Drop redundant columns
    df.drop(columns=["Reached.on.Time_Y.N"], inplace=True)
    
    return df

In [3]:
df = wrangle("shipping_data.csv")

In [4]:
df.head()

Unnamed: 0_level_0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached_on_time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,D,Flight,4,2,177,3,low,F,44,1233,0.0
2,F,Flight,4,5,216,2,low,M,59,3088,0.0
3,A,Flight,2,2,183,4,low,M,48,3374,0.0
4,B,Flight,3,3,176,4,medium,M,10,1177,0.0
5,C,Flight,2,2,184,3,medium,F,46,2484,0.0


In [5]:
df.shape

(10999, 11)

In [6]:
"""
1. Wrangle Data
2. Split Data
3. Establish Baseline
4. Build Model
5. Check Metrics
6. Tune Parameters
7. Communicate Results
"""

'\n1. Wrangle Data\n2. Split Data\n3. Establish Baseline\n4. Build Model\n5. Check Metrics\n6. Tune Parameters\n7. Communicate Results\n'

# Split Data

In [6]:
X = df.drop(columns="Reached_on_time")
y = df["Reached_on_time"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Establish Baseline

In [8]:
baseline_accuracy = y.value_counts(normalize=True).max()
baseline_accuracy

0.5966906082371125

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10999 entries, 1 to 10999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      10999 non-null  object 
 1   Mode_of_Shipment     10999 non-null  object 
 2   Customer_care_calls  10999 non-null  int64  
 3   Customer_rating      10999 non-null  int64  
 4   Cost_of_the_Product  10999 non-null  int64  
 5   Prior_purchases      10999 non-null  int64  
 6   Product_importance   10999 non-null  object 
 7   Gender               10999 non-null  object 
 8   Discount_offered     10999 non-null  int64  
 9   Weight_in_gms        10999 non-null  int64  
 10  Reached_on_time      10999 non-null  float64
dtypes: float64(1), int64(6), object(4)
memory usage: 1.0+ MB


# Build Model

In [10]:
model_lr = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    LogisticRegression()
)

model_lr.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['Warehouse_block', 'Mode_of_Shipment',
                                     'Product_importance', 'Gender'],
                               use_cat_names=True)),
                ('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [11]:
model_rfc = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    RandomForestClassifier(random_state=42)
)

model_rfc.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['Warehouse_block', 'Mode_of_Shipment',
                                      'Product_importance', 'Gender'],
                                mapping=[{'col': 'Warehouse_block',
                                          'data_type': dtype('O'),
                                          'mapping': F      1
A      2
D      3
C      4
B      5
NaN   -2
dtype: int64},
                                         {'col': 'Mode_of_Shipment',
                                          'data_type': dtype('O'),
                                          'mapping': Ship      1
Flight    2
Road      3
NaN      -2
dtype: int64},
                                         {'col': 'Product_importance',
                                          'data_type': dtype('O'),
                                          'mapping': low       1
medium    2
high      3
NaN      -2
dtype: int64},
                                         {'col': 'Gend

In [12]:
model_gbc = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    GradientBoostingClassifier(random_state=42)
)

model_gbc.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['Warehouse_block', 'Mode_of_Shipment',
                                      'Product_importance', 'Gender'],
                                mapping=[{'col': 'Warehouse_block',
                                          'data_type': dtype('O'),
                                          'mapping': F      1
A      2
D      3
C      4
B      5
NaN   -2
dtype: int64},
                                         {'col': 'Mode_of_Shipment',
                                          'data_type': dtype('O'),
                                          'mapping': Ship      1
Flight    2
Road      3
NaN      -2
dtype: int64},
                                         {'col': 'Product_importance',
                                          'data_type': dtype('O'),
                                          'mapping': low       1
medium    2
high      3
NaN      -2
dtype: int64},
                                         {'col': 'Gend

# Check Metrics

In [13]:
print("Logistic Regression Accuracy (Train): ", model_lr.score(X_train, y_train))
print("Logistic Regression Accuracy (Test): ", model_lr.score(X_test, y_test))

Logistic Regression Accuracy (Train):  0.6407546312080918
Logistic Regression Accuracy (Test):  0.644090909090909


In [14]:
print("Random Forest Accuracy (Train): ", model_rfc.score(X_train, y_train))
print("Random Forest Accuracy (Test): ", model_rfc.score(X_test, y_test))

Random Forest Accuracy (Train):  1.0
Random Forest Accuracy (Test):  0.6631818181818182


In [15]:
print("Gradient Boosting Accuracy (Train): ", model_gbc.score(X_train, y_train))
print("Gradient Boosting Accuracy (Test): ", model_gbc.score(X_test, y_test))

Gradient Boosting Accuracy (Train):  0.7140584157290601
Gradient Boosting Accuracy (Test):  0.6836363636363636


# Tune Parameters

• permutation importance

In [18]:
param_grid = {
    "gradientboostingclassifier__loss": ["deviance", "exponential"],
    "gradientboostingclassifier__n_estimators": [100, 200, 300],
    "gradientboostingclassifier__criterion": ["friedman_mse", "mse"],
    "gradientboostingclassifier__max_depth": [3, 5, 8, 10]
}

In [19]:
# RandomizedSearchCV

model_gbc_rs = RandomizedSearchCV(model_gbc,
                                  param_distributions=param_grid,
                                  n_jobs=-1, 
                                  cv=3, 
                                  verbose=5)

model_gbc_rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  elif pd.api.types.is_categorical(cols):


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['Warehouse_block',
                                                                   'Mode_of_Shipment',
                                                                   'Product_importance',
                                                                   'Gender'],
                                                             mapping=[{'col': 'Warehouse_block',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': F      1
A      2
D      3
C      4
B      5
NaN   -2
dtype: int64},
                                                                      {'col': 'Mode_of_Shipment',
                                                                       'data_type': dtype('O'),
                                 

In [20]:
model_gbc_rs.best_params_

{'gradientboostingclassifier__n_estimators': 100,
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__loss': 'deviance',
 'gradientboostingclassifier__criterion': 'mse'}

In [21]:
best_model_rs = model_gbc_rs.best_estimator_

In [22]:
# GridSearchCV

model_gbc_gs = GridSearchCV(model_gbc,
                            param_grid=param_grid,
                            n_jobs=-1,
                            cv=3,
                            verbose=5)

model_gbc_gs.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


  elif pd.api.types.is_categorical(cols):


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('ordinalencoder',
                                        OrdinalEncoder(cols=['Warehouse_block',
                                                             'Mode_of_Shipment',
                                                             'Product_importance',
                                                             'Gender'],
                                                       mapping=[{'col': 'Warehouse_block',
                                                                 'data_type': dtype('O'),
                                                                 'mapping': F      1
A      2
D      3
C      4
B      5
NaN   -2
dtype: int64},
                                                                {'col': 'Mode_of_Shipment',
                                                                 'data_type': dtype('O'),
                                                                 'mapping': Ship      1
Flight    2

In [23]:
model_gbc_gs.best_params_

{'gradientboostingclassifier__criterion': 'friedman_mse',
 'gradientboostingclassifier__loss': 'exponential',
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__n_estimators': 100}

In [24]:
best_model_gs = model_gbc_gs.best_estimator_

In [25]:
# Scores before Tuning
print("Gradient Boosting Accuracy (Train): ", model_gbc.score(X_train, y_train))
print("Gradient Boosting Accuracy (Test): ", model_gbc.score(X_test, y_test))

Gradient Boosting Accuracy (Train):  0.7140584157290601
Gradient Boosting Accuracy (Test):  0.6836363636363636


In [26]:
# Scores after RandomizedSearchCV
print("Gradient Boosting Accuracy (Train): ", best_model_rs.score(X_train, y_train))
print("Gradient Boosting Accuracy (Test): ", best_model_rs.score(X_test, y_test))

Gradient Boosting Accuracy (Train):  0.7140584157290601
Gradient Boosting Accuracy (Test):  0.6836363636363636


In [27]:
# Scores after GridSearchCV
print("Gradient Boosting Accuracy (Train): ", best_model_gs.score(X_train, y_train))
print("Gradient Boosting Accuracy (Test): ", best_model_gs.score(X_test, y_test))

Gradient Boosting Accuracy (Train):  0.7136038186157518
Gradient Boosting Accuracy (Test):  0.6859090909090909


# Communicate Results