# 02 - Delivery Delay Prediction Model

## 📌 Objective
Build and evaluate machine learning models to predict whether an order will be delayed (`Is_Delayed`).

---


In [1]:
import joblib


In [2]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# --------------------------
# LOAD DATA
# --------------------------
df = pd.read_csv("../data/cleaned_amazon_delivery.csv")

print("Columns:", df.columns.tolist())
print(df.head())

# --------------------------
# FEATURES AND TARGET
# --------------------------
numeric_cols = ["Agent_Age", "Agent_Rating"]
categorical_cols = ["Weather", "Traffic", "Vehicle", "Area", "Category"]
target_col = "Is_Delayed"

# Fail early, with a readable message, if the CSV no longer has the columns
# this cell relies on (plain column indexing would also raise KeyError, but
# without listing everything that is missing).
missing_cols = [
    col for col in numeric_cols + categorical_cols + [target_col]
    if col not in df.columns
]
if missing_cols:
    raise KeyError(f"Missing expected columns in input data: {missing_cols}")

X = df[numeric_cols + categorical_cols]
y = df[target_col]

# --------------------------
# TRAIN-TEST SPLIT
# --------------------------
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# PREPROCESSOR
# --------------------------
# One-hot encode categoricals (categories unseen at fit time are ignored at
# predict time instead of raising) and standardize the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ]
)

# --------------------------
# PIPELINE (FINAL)
# --------------------------
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])

# --------------------------
# TRAIN MODEL
# --------------------------
pipeline.fit(X_train, Y_train)

print("Training complete!")

# --------------------------
# EVALUATE ON HELD-OUT TEST SET
# --------------------------
# The test split was previously created but never used; report how the fitted
# pipeline performs on it before the model is persisted.
Y_pred = pipeline.predict(X_test)

print("Test accuracy:", accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

# --------------------------
# SAVE FINAL MODEL (CLOUD SAFE)
# --------------------------
models_dir = Path("../models")
# parents=True so the save also works when intermediate directories are absent.
models_dir.mkdir(parents=True, exist_ok=True)

save_path = models_dir / "DELAY_MODEL_FINAL_CLOUD.pkl"

# Persist the whole pipeline (preprocessor + model), compressed, pickle protocol 4.
joblib.dump(
    pipeline,
    save_path,
    compress=3,
    protocol=4
)

print("MODEL SAVED AT:", save_path)



Columns: ['Order_ID', 'Agent_Age', 'Agent_Rating', 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 'Order_Date', 'Order_Time', 'Pickup_Time', 'Weather', 'Traffic', 'Vehicle', 'Area', 'Delivery_Time', 'Category', 'Delivery_Duration_Minutes', 'Is_Delayed', 'Churned']
        Order_ID  Agent_Age  Agent_Rating  Store_Latitude  Store_Longitude  \
0  ialx566343618         37           4.9       22.745049        75.892471   
1  akqg208421122         34           4.5       12.913041        77.683237   
2  njpu434582536         23           4.4       12.914264        77.678400   
3  rjto796129700         38           4.7       11.003669        76.976494   
4  zguw716275638         32           4.6       12.972793        80.249982   

   Drop_Latitude  Drop_Longitude  Order_Date Order_Time          Pickup_Time  \
0      22.765049       75.912471  2022-03-19   11:30:00  2022-03-19 11:45:00   
1      13.043041       77.813237  2022-03-25   19:45:00  2022-03-25 19:50:00   
2

In [5]:
import joblib
# Reload the pipeline artifact written by the training cell. That cell saves
# "DELAY_MODEL_FINAL_CLOUD.pkl"; the previously referenced
# "DELAY_MODEL_FINAL.pkl" is never produced by this notebook, so loading it
# only worked when a stale file happened to exist on disk.
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
loaded_pipeline = joblib.load("../models/DELAY_MODEL_FINAL_CLOUD.pkl")

# Save the fitted XGBoost estimator (the pipeline's "model" step) to a .json file.
xgb_model = loaded_pipeline.named_steps["model"]
xgb_model.save_model("../models/DELAY_MODEL_FINAL.json")

print("DONE")


DONE


In [6]:
# Display the steps (name -> estimator) of the reloaded pipeline.
loaded_pipeline.named_steps


{'preprocessor': ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                  ['Weather', 'Traffic', 'Vehicle', 'Area',
                                   'Category']),
                                 ('num', StandardScaler(),
                                  ['Agent_Age', 'Agent_Rating'])]),
 'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, feature_weights=None, gamma=None,
               grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=6, max_leaves=None,
               min_child_weight=None, missing

In [7]:
import joblib
# Reload the pipeline artifact written by the training cell. That cell saves
# "DELAY_MODEL_FINAL_CLOUD.pkl"; the previously referenced
# "DELAY_MODEL_FINAL.pkl" is never produced by this notebook, so loading it
# only worked when a stale file happened to exist on disk.
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
loaded_pipeline = joblib.load("../models/DELAY_MODEL_FINAL_CLOUD.pkl")

# Save only the low-level Booster behind the pipeline's "model" step to a .json file.
booster = loaded_pipeline.named_steps["model"].get_booster()
booster.save_model("../models/DELAY_MODEL_FINAL_BOOSTER.json")

print("done")


done


In [3]:
# ==== SPLIT THE FITTED PIPELINE AND PERSIST EACH PART ====

from pathlib import Path
import joblib

models_dir = Path("../models")

# Pull the two fitted steps out of the pipeline by their step names
pre = pipeline.named_steps["preprocessor"]
xgb_clf = pipeline.named_steps["model"]

# Preprocessor -> pickle file (protocol 4)
joblib.dump(
    pre,
    models_dir.joinpath("PREPROCESSOR.pkl"),
    protocol=4,
)

# Booster only -> .json file (path passed as a plain string)
xgb_clf.get_booster().save_model(
    str(models_dir.joinpath("XGBMODEL.json"))
)

print("DONE âœ”")


DONE âœ”


In [4]:
# Write both artifacts again: the preprocessor as a pickle, the booster as .json
joblib.dump(
    pre,
    models_dir.joinpath("PREPROCESSOR.pkl"),
    protocol=4,
)
xgb_clf.get_booster().save_model(
    models_dir.joinpath("XGBMODEL.json")
)
